diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt index bf54c2f1d3c..ac2d952b965 100644 --- a/crypto/fipsmodule/CMakeLists.txt +++ b/crypto/fipsmodule/CMakeLists.txt @@ -193,12 +193,12 @@ if((((ARCH STREQUAL "x86_64") AND NOT MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) OR # Set the source directory for s2n-bignum assembly files if(ARCH STREQUAL "x86_64") - set(S2N_BIGNUM_DIR ${AWSLC_SOURCE_DIR}/third_party/s2n-bignum/x86_att) + set(S2N_BIGNUM_DIR ${AWSLC_SOURCE_DIR}/third_party/s2n-bignum/s2n-bignum-imported/x86_att) else() - set(S2N_BIGNUM_DIR ${AWSLC_SOURCE_DIR}/third_party/s2n-bignum/arm) + set(S2N_BIGNUM_DIR ${AWSLC_SOURCE_DIR}/third_party/s2n-bignum/s2n-bignum-imported/arm) endif() - set(S2N_BIGNUM_INCLUDE_DIR ${AWSLC_SOURCE_DIR}/third_party/s2n-bignum/include) + set(S2N_BIGNUM_INCLUDE_DIR ${AWSLC_SOURCE_DIR}/third_party/s2n-bignum/s2n-bignum-imported/include) # We add s2n-bignum files to a separate list because they need # to go through C preprocessor in case of the static build. @@ -290,16 +290,10 @@ if((((ARCH STREQUAL "x86_64") AND NOT MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) OR generic/bignum_optsub.S generic/bignum_sqr.S - fastmul/bignum_kmul_16_32_neon.S - fastmul/bignum_kmul_32_64_neon.S - fastmul/bignum_ksqr_16_32_neon.S - fastmul/bignum_ksqr_32_64_neon.S - fastmul/bignum_emontredc_8n_neon.S - generic/bignum_copy_row_from_table.S - generic/bignum_copy_row_from_table_8n_neon.S - generic/bignum_copy_row_from_table_16_neon.S - generic/bignum_copy_row_from_table_32_neon.S + generic/bignum_copy_row_from_table_8n.S + generic/bignum_copy_row_from_table_16.S + generic/bignum_copy_row_from_table_32.S ) endif() endif() diff --git a/crypto/fipsmodule/bn/exponentiation.c b/crypto/fipsmodule/bn/exponentiation.c index a35658223b1..8713715b037 100644 --- a/crypto/fipsmodule/bn/exponentiation.c +++ b/crypto/fipsmodule/bn/exponentiation.c @@ -124,11 +124,13 @@ defined(OPENSSL_OPENBSD) || defined(OPENSSL_FREEBSD)) && \ defined(OPENSSL_AARCH64) -#include "../../../third_party/s2n-bignum/include/s2n-bignum_aws-lc.h" +#include "../../../third_party/s2n-bignum/s2n-bignum_aws-lc.h" #define BN_EXPONENTIATION_S2N_BIGNUM_CAPABLE 1 -OPENSSL_INLINE int exponentiation_use_s2n_bignum(void) { return 1; } +OPENSSL_INLINE int exponentiation_use_s2n_bignum(void) { + return CRYPTO_is_NEON_capable(); +} #else @@ -143,17 +145,12 @@ static void exponentiation_s2n_bignum_copy_from_prebuf(BN_ULONG *dest, int width #if defined(BN_EXPONENTIATION_S2N_BIGNUM_CAPABLE) int table_height = 1 << window; - if (CRYPTO_is_NEON_capable()) { - if (width == 32) { - bignum_copy_row_from_table_32_neon(dest, table, table_height, rowidx); - } else if (width == 16) { - bignum_copy_row_from_table_16_neon(dest, table, table_height, rowidx); - } else if (width % 8 == 0) { - bignum_copy_row_from_table_8n_neon(dest, table, table_height, width, - rowidx); - } else { - bignum_copy_row_from_table(dest, table, table_height, width, rowidx); - } + if (width == 32) { + bignum_copy_row_from_table_32(dest, table, table_height, rowidx); + } else if (width == 16) { + bignum_copy_row_from_table_16(dest, table, table_height, rowidx); + } else if (width % 8 == 0) { + bignum_copy_row_from_table_8n(dest, table, table_height, width, rowidx); } else { bignum_copy_row_from_table(dest, table, table_height, width, rowidx); } diff --git a/crypto/fipsmodule/bn/montgomery.c b/crypto/fipsmodule/bn/montgomery.c index 38a651b9bbf..c7ac15c18d6 100644 --- a/crypto/fipsmodule/bn/montgomery.c +++ b/crypto/fipsmodule/bn/montgomery.c @@ -127,7 +127,7 @@ 
defined(OPENSSL_OPENBSD) || defined(OPENSSL_FREEBSD)) && \ defined(OPENSSL_AARCH64) && defined(OPENSSL_BN_ASM_MONT) -#include "../../../third_party/s2n-bignum/include/s2n-bignum_aws-lc.h" +#include "../../../third_party/s2n-bignum/s2n-bignum_aws-lc.h" #define BN_MONTGOMERY_S2N_BIGNUM_CAPABLE 1 @@ -137,11 +137,14 @@ OPENSSL_INLINE int montgomery_use_s2n_bignum(unsigned int num) { // (2) num (which is the number of words) is multiplie of 8, because // s2n-bignum's bignum_emontredc_8n requires it, and // (3) The word size is 64 bits. + // (4) CPU has NEON. assert(S2NBIGNUM_KSQR_16_32_TEMP_NWORDS <= S2NBIGNUM_KMUL_32_64_TEMP_NWORDS && S2NBIGNUM_KSQR_32_64_TEMP_NWORDS <= S2NBIGNUM_KMUL_32_64_TEMP_NWORDS && S2NBIGNUM_KMUL_16_32_TEMP_NWORDS <= S2NBIGNUM_KMUL_32_64_TEMP_NWORDS); assert(BN_BITS2 == 64); - return !CRYPTO_is_ARMv8_wide_multiplier_capable() && (num % 8 == 0); + return !CRYPTO_is_ARMv8_wide_multiplier_capable() && + (num % 8 == 0) && + CRYPTO_is_NEON_capable(); } #else @@ -454,7 +457,7 @@ static int bn_mod_mul_montgomery_fallback(BIGNUM *r, const BIGNUM *a, // are equivalent to the arguments of bn_mul_mont. // montgomery_s2n_bignum_mul_mont works only if num is a multiple of 8. // montgomery_use_s2n_bignum(num) must be called in advance to check this -// condition. +// condition, as well as other s2n-bignum requirements. // For num = 32 or num = 16, this uses faster primitives in s2n-bignum. // montgomery_s2n_bignum_mul_mont allocates S2NBIGNUM_KMUL_32_64_TEMP_NWORDS + // 2 * BN_MONTGOMERY_MAX_WORDS uint64_t words at the stack. @@ -477,34 +480,23 @@ static void montgomery_s2n_bignum_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, uint64_t w = n0[0]; if (num == 32) { - if (CRYPTO_is_NEON_capable()) { - if (ap == bp) - bignum_ksqr_32_64_neon(mulres, ap, t); - else - bignum_kmul_32_64_neon(mulres, ap, bp, t); + if (ap == bp) { + bignum_ksqr_32_64(mulres, ap, t); } else { - if (ap == bp) - bignum_ksqr_32_64(mulres, ap, t); - else - bignum_kmul_32_64(mulres, ap, bp, t); + bignum_kmul_32_64(mulres, ap, bp, t); } } else if (num == 16) { - if (CRYPTO_is_NEON_capable()) { - if (ap == bp) - bignum_ksqr_16_32_neon(mulres, ap, t); - else - bignum_kmul_16_32_neon(mulres, ap, bp, t); + if (ap == bp) { + bignum_ksqr_16_32(mulres, ap, t); } else { - if (ap == bp) - bignum_ksqr_16_32(mulres, ap, t); - else - bignum_kmul_16_32(mulres, ap, bp, t); + bignum_kmul_16_32(mulres, ap, bp, t); } } else { - if (ap == bp) + if (ap == bp) { bignum_sqr(num * 2, mulres, num, ap); - else + } else { bignum_mul(num * 2, mulres, num, ap, num, bp); + } } // Do montgomery reduction. We follow the definition of montgomery reduction @@ -518,9 +510,7 @@ static void montgomery_s2n_bignum_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, // A. The result of step 1 >= 2^(64*num), meaning that bignum_emontredc_8n // returned 1. Since m is less than 2^(64*num), (result of step 1) >= m holds. // B. The result of step 1 fits in 2^(64*num), and the result >= m. - uint64_t c = CRYPTO_is_NEON_capable() ? 
- bignum_emontredc_8n_neon(num, mulres, np, w) : - bignum_emontredc_8n(num, mulres, np, w); // c: case A + uint64_t c = bignum_emontredc_8n(num, mulres, np, w); // c: case A c |= bignum_ge(num, mulres + num, num, np); // c: case B // Optionally subtract and store the result at rp bignum_optsub(num, rp, mulres + num, c, np); diff --git a/crypto/fipsmodule/curve25519/curve25519_s2n_bignum_asm.c b/crypto/fipsmodule/curve25519/curve25519_s2n_bignum_asm.c index af225f6784f..64a1ed5e703 100644 --- a/crypto/fipsmodule/curve25519/curve25519_s2n_bignum_asm.c +++ b/crypto/fipsmodule/curve25519/curve25519_s2n_bignum_asm.c @@ -5,7 +5,7 @@ #include "../cpucap/internal.h" #if defined(CURVE25519_S2N_BIGNUM_CAPABLE) -#include "../../../third_party/s2n-bignum/include/s2n-bignum_aws-lc.h" +#include "../../../third_party/s2n-bignum/s2n-bignum_aws-lc.h" void x25519_scalar_mult_generic_s2n_bignum( uint8_t out_shared_key[X25519_SHARED_KEY_LEN], diff --git a/crypto/fipsmodule/ec/p256-nistz.c b/crypto/fipsmodule/ec/p256-nistz.c index 3e8afd6aadb..5b2431f0c2a 100644 --- a/crypto/fipsmodule/ec/p256-nistz.c +++ b/crypto/fipsmodule/ec/p256-nistz.c @@ -34,7 +34,7 @@ #include "ec_nistp.h" #if defined(EC_NISTP_USE_S2N_BIGNUM) -#include "../../../third_party/s2n-bignum/include/s2n-bignum_aws-lc.h" +#include "../../../third_party/s2n-bignum/s2n-bignum_aws-lc.h" #endif #if !defined(OPENSSL_NO_ASM) && \ diff --git a/crypto/fipsmodule/ec/p384.c b/crypto/fipsmodule/ec/p384.c index ba5d780a797..581969f34a7 100644 --- a/crypto/fipsmodule/ec/p384.c +++ b/crypto/fipsmodule/ec/p384.c @@ -19,7 +19,7 @@ #if !defined(OPENSSL_SMALL) #if defined(EC_NISTP_USE_S2N_BIGNUM) -# include "../../../third_party/s2n-bignum/include/s2n-bignum_aws-lc.h" +# include "../../../third_party/s2n-bignum/s2n-bignum_aws-lc.h" #else # if defined(EC_NISTP_USE_64BIT_LIMB) # include "../../../third_party/fiat/p384_64.h" diff --git a/crypto/fipsmodule/ec/p521.c b/crypto/fipsmodule/ec/p521.c index db45e518503..ff27c8f7a8b 100644 --- a/crypto/fipsmodule/ec/p521.c +++ b/crypto/fipsmodule/ec/p521.c @@ -22,7 +22,7 @@ #if !defined(OPENSSL_SMALL) #if defined(EC_NISTP_USE_S2N_BIGNUM) -# include "../../../third_party/s2n-bignum/include/s2n-bignum_aws-lc.h" +# include "../../../third_party/s2n-bignum/s2n-bignum_aws-lc.h" #else # if defined(EC_NISTP_USE_64BIT_LIMB) # include "../../../third_party/fiat/p521_64.h" diff --git a/third_party/s2n-bignum/META.yml b/third_party/s2n-bignum/META.yml new file mode 100644 index 00000000000..4949b2bf609 --- /dev/null +++ b/third_party/s2n-bignum/META.yml @@ -0,0 +1,5 @@ +name: s2n-bignum-imported +source: awslabs/s2n-bignum.git +commit: 54e1fa5756d6b13961c2f61d90f75426aa25d373 +target: main +imported-at: 2025-04-28T17:22:07+0000 diff --git a/third_party/s2n-bignum/arm/fastmul/bignum_kmul_16_32.S b/third_party/s2n-bignum/arm/fastmul/bignum_kmul_16_32.S deleted file mode 100644 index e45dd487e1f..00000000000 --- a/third_party/s2n-bignum/arm/fastmul/bignum_kmul_16_32.S +++ /dev/null @@ -1,798 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Multiply z := x * y -// Inputs x[16], y[16]; output z[32]; temporary buffer t[>=32] -// -// extern void bignum_kmul_16_32 -// (uint64_t z[static 32], uint64_t x[static 16], uint64_t y[static 16], -// uint64_t t[static 32]) -// -// This is a Karatsuba-style function multiplying half-sized results -// internally and using temporary buffer t for intermediate results. -// -// Standard ARM ABI: X0 = z, X1 = x, X2 = y, X3 = t -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_kmul_16_32) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_kmul_16_32) - .text - .balign 4 - -// Subroutine-safe copies of the output, inputs and temporary buffer pointers - -#define z x25 -#define x x26 -#define y x27 -#define t x28 - -// More variables for sign masks, with s also necessarily subroutine-safe - -#define s x29 -#define m x19 - -S2N_BN_SYMBOL(bignum_kmul_16_32): - -// Save registers, including return address - - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! - stp x23, x24, [sp, #-16]! - stp x25, x26, [sp, #-16]! - stp x27, x28, [sp, #-16]! - stp x29, x30, [sp, #-16]! - -// Move parameters into subroutine-safe places - - mov z, x0 - mov x, x1 - mov y, x2 - mov t, x3 - -// Compute L = x_lo * y_lo in bottom half of buffer (size 8 x 8 -> 16) - - bl bignum_kmul_16_32_local_mul_8_16 - -// Compute absolute difference [t..] = |x_lo - x_hi| -// and the sign s = sgn(x_lo - x_hi) as a bitmask (all 1s for negative) - - ldp x10, x11, [x] - ldp x8, x9, [x, #64] - subs x10, x10, x8 - sbcs x11, x11, x9 - ldp x12, x13, [x, #16] - ldp x8, x9, [x, #80] - sbcs x12, x12, x8 - sbcs x13, x13, x9 - ldp x14, x15, [x, #32] - ldp x8, x9, [x, #96] - sbcs x14, x14, x8 - sbcs x15, x15, x9 - ldp x16, x17, [x, #48] - ldp x8, x9, [x, #112] - sbcs x16, x16, x8 - sbcs x17, x17, x9 - csetm s, cc - adds xzr, s, s - eor x10, x10, s - adcs x10, x10, xzr - eor x11, x11, s - adcs x11, x11, xzr - stp x10, x11, [t] - eor x12, x12, s - adcs x12, x12, xzr - eor x13, x13, s - adcs x13, x13, xzr - stp x12, x13, [t, #16] - eor x14, x14, s - adcs x14, x14, xzr - eor x15, x15, s - adcs x15, x15, xzr - stp x14, x15, [t, #32] - eor x16, x16, s - adcs x16, x16, xzr - eor x17, x17, s - adcs x17, x17, xzr - stp x16, x17, [t, #48] - -// Compute H = x_hi * y_hi in top half of buffer (size 8 x 8 -> 16) - - add x0, z, #128 - add x1, x, #64 - add x2, y, #64 - bl bignum_kmul_16_32_local_mul_8_16 - -// Compute the other absolute difference [t+8..] 
= |y_hi - y_lo| -// Collect the combined product sign bitmask (all 1s for negative) in s - - ldp x10, x11, [y] - ldp x8, x9, [y, #64] - subs x10, x8, x10 - sbcs x11, x9, x11 - ldp x12, x13, [y, #16] - ldp x8, x9, [y, #80] - sbcs x12, x8, x12 - sbcs x13, x9, x13 - ldp x14, x15, [y, #32] - ldp x8, x9, [y, #96] - sbcs x14, x8, x14 - sbcs x15, x9, x15 - ldp x16, x17, [y, #48] - ldp x8, x9, [y, #112] - sbcs x16, x8, x16 - sbcs x17, x9, x17 - csetm m, cc - adds xzr, m, m - eor x10, x10, m - adcs x10, x10, xzr - eor x11, x11, m - adcs x11, x11, xzr - stp x10, x11, [t, #64] - eor x12, x12, m - adcs x12, x12, xzr - eor x13, x13, m - adcs x13, x13, xzr - stp x12, x13, [t, #80] - eor x14, x14, m - adcs x14, x14, xzr - eor x15, x15, m - adcs x15, x15, xzr - stp x14, x15, [t, #96] - eor x16, x16, m - adcs x16, x16, xzr - eor x17, x17, m - adcs x17, x17, xzr - stp x16, x17, [t, #112] - eor s, s, m - -// Compute H' = H + L_top in place of H (it cannot overflow) -// First add 8-sized block then propagate carry through next 8 - - ldp x10, x11, [z, #128] - ldp x12, x13, [z, #64] - adds x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [z, #128] - - ldp x10, x11, [z, #128+16] - ldp x12, x13, [z, #64+16] - adcs x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [z, #128+16] - - ldp x10, x11, [z, #128+32] - ldp x12, x13, [z, #64+32] - adcs x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [z, #128+32] - - ldp x10, x11, [z, #128+48] - ldp x12, x13, [z, #64+48] - adcs x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [z, #128+48] - - ldp x10, x11, [z, #128+64] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [z, #128+64] - - ldp x10, x11, [z, #128+80] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [z, #128+80] - - ldp x10, x11, [z, #128+96] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [z, #128+96] - - ldp x10, x11, [z, #128+112] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [z, #128+112] - -// Compute M = |x_lo - x_hi| * |y_hi - y_lo| in [t+16...], size 16 - - add x0, t, #128 - mov x1, t - add x2, t, #64 - bl bignum_kmul_16_32_local_mul_8_16 - -// Add the interlocking H' and L_bot terms, storing in registers x15..x0 -// Intercept the carry at the 8 + 16 = 24 position and store it in x. -// (Note that we no longer need the input x was pointing at.) 
- - ldp x0, x1, [z] - ldp x16, x17, [z, #128] - adds x0, x0, x16 - adcs x1, x1, x17 - ldp x2, x3, [z, #16] - ldp x16, x17, [z, #144] - adcs x2, x2, x16 - adcs x3, x3, x17 - ldp x4, x5, [z, #32] - ldp x16, x17, [z, #160] - adcs x4, x4, x16 - adcs x5, x5, x17 - ldp x6, x7, [z, #48] - ldp x16, x17, [z, #176] - adcs x6, x6, x16 - adcs x7, x7, x17 - ldp x8, x9, [z, #128] - ldp x16, x17, [z, #192] - adcs x8, x8, x16 - adcs x9, x9, x17 - ldp x10, x11, [z, #144] - ldp x16, x17, [z, #208] - adcs x10, x10, x16 - adcs x11, x11, x17 - ldp x12, x13, [z, #160] - ldp x16, x17, [z, #224] - adcs x12, x12, x16 - adcs x13, x13, x17 - ldp x14, x15, [z, #176] - ldp x16, x17, [z, #240] - adcs x14, x14, x16 - adcs x15, x15, x17 - - cset x, cs - -// Add the sign-adjusted mid-term cross product M - - cmn s, s - - ldp x16, x17, [t, #128] - eor x16, x16, s - adcs x0, x0, x16 - eor x17, x17, s - adcs x1, x1, x17 - stp x0, x1, [z, #64] - ldp x16, x17, [t, #144] - eor x16, x16, s - adcs x2, x2, x16 - eor x17, x17, s - adcs x3, x3, x17 - stp x2, x3, [z, #80] - ldp x16, x17, [t, #160] - eor x16, x16, s - adcs x4, x4, x16 - eor x17, x17, s - adcs x5, x5, x17 - stp x4, x5, [z, #96] - ldp x16, x17, [t, #176] - eor x16, x16, s - adcs x6, x6, x16 - eor x17, x17, s - adcs x7, x7, x17 - stp x6, x7, [z, #112] - ldp x16, x17, [t, #192] - eor x16, x16, s - adcs x8, x8, x16 - eor x17, x17, s - adcs x9, x9, x17 - stp x8, x9, [z, #128] - ldp x16, x17, [t, #208] - eor x16, x16, s - adcs x10, x10, x16 - eor x17, x17, s - adcs x11, x11, x17 - stp x10, x11, [z, #144] - ldp x16, x17, [t, #224] - eor x16, x16, s - adcs x12, x12, x16 - eor x17, x17, s - adcs x13, x13, x17 - stp x12, x13, [z, #160] - ldp x16, x17, [t, #240] - eor x16, x16, s - adcs x14, x14, x16 - eor x17, x17, s - adcs x15, x15, x17 - stp x14, x15, [z, #176] - -// Get the next digits effectively resulting so far starting at 24 - - adcs y, s, x - adc t, s, xzr - -// Now the final 8 digits of padding; the first one is special in using y -// and also in getting the carry chain started - - ldp x10, x11, [z, #192] - adds x10, x10, y - adcs x11, x11, t - stp x10, x11, [z, #192] - ldp x10, x11, [z, #208] - adcs x10, x10, t - adcs x11, x11, t - stp x10, x11, [z, #208] - ldp x10, x11, [z, #224] - adcs x10, x10, t - adcs x11, x11, t - stp x10, x11, [z, #224] - ldp x10, x11, [z, #240] - adcs x10, x10, t - adcs x11, x11, t - stp x10, x11, [z, #240] - -// Restore registers and return - - ldp x29, x30, [sp], #16 - ldp x27, x28, [sp], #16 - ldp x25, x26, [sp], #16 - ldp x23, x24, [sp], #16 - ldp x21, x22, [sp], #16 - ldp x19, x20, [sp], #16 - - ret - -// ----------------------------------------------------------------------- -// Local copy of bignum_mul_8_16 without the scratch register save/restore -// ----------------------------------------------------------------------- - -bignum_kmul_16_32_local_mul_8_16: - ldp x3, x4, [x1] - ldp x7, x8, [x2] - ldp x5, x6, [x1, #16] - ldp x9, x10, [x2, #16] - mul x11, x3, x7 - mul x15, x4, x8 - mul x16, x5, x9 - mul x17, x6, x10 - umulh x19, x3, x7 - adds x15, x15, x19 - umulh x19, x4, x8 - adcs x16, x16, x19 - umulh x19, x5, x9 - adcs x17, x17, x19 - umulh x19, x6, x10 - adc x19, x19, xzr - adds x12, x15, x11 - adcs x15, x16, x15 - adcs x16, x17, x16 - adcs x17, x19, x17 - adc x19, xzr, x19 - adds x13, x15, x11 - adcs x14, x16, x12 - adcs x15, x17, x15 - adcs x16, x19, x16 - adcs x17, xzr, x17 - adc x19, xzr, x19 - subs x24, x5, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x9 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - 
cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x16, x16, x22 - eor x21, x21, x20 - adcs x17, x17, x21 - adc x19, x19, x20 - subs x24, x3, x4 - cneg x24, x24, cc - csetm x20, cc - subs x21, x8, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x12, x12, x22 - eor x21, x21, x20 - adcs x13, x13, x21 - adcs x14, x14, x20 - adcs x15, x15, x20 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x4, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x8 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x15, x15, x22 - eor x21, x21, x20 - adcs x16, x16, x21 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x3, x5 - cneg x24, x24, cc - csetm x20, cc - subs x21, x9, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x13, x13, x22 - eor x21, x21, x20 - adcs x14, x14, x21 - adcs x15, x15, x20 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x3, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x14, x14, x22 - eor x21, x21, x20 - adcs x15, x15, x21 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x4, x5 - cneg x24, x24, cc - csetm x20, cc - subs x21, x9, x8 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x14, x14, x22 - eor x21, x21, x20 - adcs x15, x15, x21 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - ldp x3, x4, [x1, #32] - stp x11, x12, [x0] - ldp x7, x8, [x2, #32] - stp x13, x14, [x0, #16] - ldp x5, x6, [x1, #48] - stp x15, x16, [x0, #32] - ldp x9, x10, [x2, #48] - stp x17, x19, [x0, #48] - mul x11, x3, x7 - mul x15, x4, x8 - mul x16, x5, x9 - mul x17, x6, x10 - umulh x19, x3, x7 - adds x15, x15, x19 - umulh x19, x4, x8 - adcs x16, x16, x19 - umulh x19, x5, x9 - adcs x17, x17, x19 - umulh x19, x6, x10 - adc x19, x19, xzr - adds x12, x15, x11 - adcs x15, x16, x15 - adcs x16, x17, x16 - adcs x17, x19, x17 - adc x19, xzr, x19 - adds x13, x15, x11 - adcs x14, x16, x12 - adcs x15, x17, x15 - adcs x16, x19, x16 - adcs x17, xzr, x17 - adc x19, xzr, x19 - ldp x22, x21, [x0, #32] - adds x11, x11, x22 - adcs x12, x12, x21 - ldp x22, x21, [x0, #48] - adcs x13, x13, x22 - adcs x14, x14, x21 - adcs x15, x15, xzr - adcs x16, x16, xzr - adcs x17, x17, xzr - adc x19, x19, xzr - subs x24, x5, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x9 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x16, x16, x22 - eor x21, x21, x20 - adcs x17, x17, x21 - adc x19, x19, x20 - subs x24, x3, x4 - cneg x24, x24, cc - csetm x20, cc - subs x21, x8, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x12, x12, x22 - eor x21, x21, x20 - adcs x13, x13, x21 - adcs x14, x14, x20 - adcs x15, x15, x20 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x4, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x8 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x15, x15, x22 - eor x21, x21, x20 - adcs x16, x16, x21 - adcs x17, x17, x20 - 
adc x19, x19, x20 - subs x24, x3, x5 - cneg x24, x24, cc - csetm x20, cc - subs x21, x9, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x13, x13, x22 - eor x21, x21, x20 - adcs x14, x14, x21 - adcs x15, x15, x20 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x3, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x14, x14, x22 - eor x21, x21, x20 - adcs x15, x15, x21 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x4, x5 - cneg x24, x24, cc - csetm x20, cc - subs x21, x9, x8 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x14, x14, x22 - eor x21, x21, x20 - adcs x15, x15, x21 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - ldp x22, x21, [x1] - subs x3, x3, x22 - sbcs x4, x4, x21 - ldp x22, x21, [x1, #16] - sbcs x5, x5, x22 - sbcs x6, x6, x21 - csetm x24, cc - stp x11, x12, [x0, #64] - ldp x22, x21, [x2] - subs x7, x22, x7 - sbcs x8, x21, x8 - ldp x22, x21, [x2, #16] - sbcs x9, x22, x9 - sbcs x10, x21, x10 - csetm x1, cc - stp x13, x14, [x0, #80] - eor x3, x3, x24 - subs x3, x3, x24 - eor x4, x4, x24 - sbcs x4, x4, x24 - eor x5, x5, x24 - sbcs x5, x5, x24 - eor x6, x6, x24 - sbc x6, x6, x24 - stp x15, x16, [x0, #96] - eor x7, x7, x1 - subs x7, x7, x1 - eor x8, x8, x1 - sbcs x8, x8, x1 - eor x9, x9, x1 - sbcs x9, x9, x1 - eor x10, x10, x1 - sbc x10, x10, x1 - stp x17, x19, [x0, #112] - eor x1, x1, x24 - mul x11, x3, x7 - mul x15, x4, x8 - mul x16, x5, x9 - mul x17, x6, x10 - umulh x19, x3, x7 - adds x15, x15, x19 - umulh x19, x4, x8 - adcs x16, x16, x19 - umulh x19, x5, x9 - adcs x17, x17, x19 - umulh x19, x6, x10 - adc x19, x19, xzr - adds x12, x15, x11 - adcs x15, x16, x15 - adcs x16, x17, x16 - adcs x17, x19, x17 - adc x19, xzr, x19 - adds x13, x15, x11 - adcs x14, x16, x12 - adcs x15, x17, x15 - adcs x16, x19, x16 - adcs x17, xzr, x17 - adc x19, xzr, x19 - subs x24, x5, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x9 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x16, x16, x22 - eor x21, x21, x20 - adcs x17, x17, x21 - adc x19, x19, x20 - subs x24, x3, x4 - cneg x24, x24, cc - csetm x20, cc - subs x21, x8, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x12, x12, x22 - eor x21, x21, x20 - adcs x13, x13, x21 - adcs x14, x14, x20 - adcs x15, x15, x20 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x4, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x8 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x15, x15, x22 - eor x21, x21, x20 - adcs x16, x16, x21 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x3, x5 - cneg x24, x24, cc - csetm x20, cc - subs x21, x9, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x13, x13, x22 - eor x21, x21, x20 - adcs x14, x14, x21 - adcs x15, x15, x20 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x3, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - 
cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x14, x14, x22 - eor x21, x21, x20 - adcs x15, x15, x21 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x4, x5 - cneg x24, x24, cc - csetm x20, cc - subs x21, x9, x8 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x14, x14, x22 - eor x21, x21, x20 - adcs x15, x15, x21 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - ldp x3, x4, [x0] - ldp x7, x8, [x0, #64] - adds x3, x3, x7 - adcs x4, x4, x8 - ldp x5, x6, [x0, #16] - ldp x9, x10, [x0, #80] - adcs x5, x5, x9 - adcs x6, x6, x10 - ldp x20, x21, [x0, #96] - adcs x7, x7, x20 - adcs x8, x8, x21 - ldp x22, x23, [x0, #112] - adcs x9, x9, x22 - adcs x10, x10, x23 - adcs x24, x1, xzr - adc x2, x1, xzr - cmn x1, #0x1 - eor x11, x11, x1 - adcs x3, x11, x3 - eor x12, x12, x1 - adcs x4, x12, x4 - eor x13, x13, x1 - adcs x5, x13, x5 - eor x14, x14, x1 - adcs x6, x14, x6 - eor x15, x15, x1 - adcs x7, x15, x7 - eor x16, x16, x1 - adcs x8, x16, x8 - eor x17, x17, x1 - adcs x9, x17, x9 - eor x19, x19, x1 - adcs x10, x19, x10 - adcs x20, x20, x24 - adcs x21, x21, x2 - adcs x22, x22, x2 - adc x23, x23, x2 - stp x3, x4, [x0, #32] - stp x5, x6, [x0, #48] - stp x7, x8, [x0, #64] - stp x9, x10, [x0, #80] - stp x20, x21, [x0, #96] - stp x22, x23, [x0, #112] - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif diff --git a/third_party/s2n-bignum/arm/fastmul/bignum_kmul_32_64.S b/third_party/s2n-bignum/arm/fastmul/bignum_kmul_32_64.S deleted file mode 100644 index e45249462ac..00000000000 --- a/third_party/s2n-bignum/arm/fastmul/bignum_kmul_32_64.S +++ /dev/null @@ -1,1348 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Multiply z := x * y -// Inputs x[32], y[32]; output z[64]; temporary buffer t[>=96] -// -// extern void bignum_kmul_32_64 -// (uint64_t z[static 64], uint64_t x[static 32], uint64_t y[static 32], -// uint64_t t[static 96]) -// -// This is a Karatsuba-style function multiplying half-sized results -// internally and using temporary buffer t for intermediate results. -// -// Standard ARM ABI: X0 = z, X1 = x, X2 = y, X3 = t -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_kmul_32_64) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_kmul_32_64) - .text - .balign 4 - -#define K 16 -#define L 8 // this is (K/2) - -#define z x19 -#define x x20 -#define y x21 -#define t x22 - -#define c x16 - -S2N_BN_SYMBOL(bignum_kmul_32_64): - -// Save extra registers and return address, store parameters safely - - stp x19, x20, [sp, -16]! - stp x21, x22, [sp, -16]! - stp x23, x24, [sp, -16]! - stp x25, x26, [sp, -16]! - stp x27, x28, [sp, -16]! - stp x29, x30, [sp, -16]! - - mov z, x0 - mov x, x1 - mov y, x2 - mov t, x3 - -// Compute L = x_lo * y_lo in bottom half of buffer (size 16 x 16 -> 32) - - bl bignum_kmul_32_64_local_kmul_16_32 - -// Compute H = x_hi * y_hi in top half of buffer (size 16 x 16 -> 32) - - add x0, z, #16*K - add x1, x, #8*K - add x2, y, #8*K - mov x3, t - bl bignum_kmul_32_64_local_kmul_16_32 - -// Compute absolute difference [t..] 
= |x_lo - x_hi| -// and the sign x = sgn(x_lo - x_hi) as a bitmask (all 1s for negative) -// Note that we overwrite the pointer x itself with this sign, -// which is safe since we no longer need it. - - ldp x0, x1, [x, #128] - ldp x16, x17, [x] - subs x0, x0, x16 - sbcs x1, x1, x17 - - ldp x2, x3, [x, #144] - ldp x16, x17, [x, #16] - sbcs x2, x2, x16 - sbcs x3, x3, x17 - - ldp x4, x5, [x, #160] - ldp x16, x17, [x, #32] - sbcs x4, x4, x16 - sbcs x5, x5, x17 - - ldp x6, x7, [x, #176] - ldp x16, x17, [x, #48] - sbcs x6, x6, x16 - sbcs x7, x7, x17 - - ldp x8, x9, [x, #192] - ldp x16, x17, [x, #64] - sbcs x8, x8, x16 - sbcs x9, x9, x17 - - ldp x10, x11, [x, #208] - ldp x16, x17, [x, #80] - sbcs x10, x10, x16 - sbcs x11, x11, x17 - - ldp x12, x13, [x, #224] - ldp x16, x17, [x, #96] - sbcs x12, x12, x16 - sbcs x13, x13, x17 - - ldp x14, x15, [x, #240] - ldp x16, x17, [x, #112] - sbcs x14, x14, x16 - sbcs x15, x15, x17 - - sbc x, xzr, xzr - - adds xzr, x, x - - eor x0, x0, x - adcs x0, x0, xzr - eor x1, x1, x - adcs x1, x1, xzr - stp x0, x1, [t] - - eor x2, x2, x - adcs x2, x2, xzr - eor x3, x3, x - adcs x3, x3, xzr - stp x2, x3, [t, #16] - - eor x4, x4, x - adcs x4, x4, xzr - eor x5, x5, x - adcs x5, x5, xzr - stp x4, x5, [t, #32] - - eor x6, x6, x - adcs x6, x6, xzr - eor x7, x7, x - adcs x7, x7, xzr - stp x6, x7, [t, #48] - - eor x8, x8, x - adcs x8, x8, xzr - eor x9, x9, x - adcs x9, x9, xzr - stp x8, x9, [t, #64] - - eor x10, x10, x - adcs x10, x10, xzr - eor x11, x11, x - adcs x11, x11, xzr - stp x10, x11, [t, #80] - - eor x12, x12, x - adcs x12, x12, xzr - eor x13, x13, x - adcs x13, x13, xzr - stp x12, x13, [t, #96] - - eor x14, x14, x - adcs x14, x14, xzr - eor x15, x15, x - adc x15, x15, xzr - stp x14, x15, [t, #112] - -// Compute the other absolute difference [t+8*K..] = |y_hi - y_lo| -// Collect the combined product sign bitmask (all 1s for negative) as -// y = sgn((x_lo - x_hi) * (y_hi - y_lo)), overwriting the y pointer. 
- - ldp x0, x1, [y] - ldp x16, x17, [y, #128] - subs x0, x0, x16 - sbcs x1, x1, x17 - - ldp x2, x3, [y, #16] - ldp x16, x17, [y, #144] - sbcs x2, x2, x16 - sbcs x3, x3, x17 - - ldp x4, x5, [y, #32] - ldp x16, x17, [y, #160] - sbcs x4, x4, x16 - sbcs x5, x5, x17 - - ldp x6, x7, [y, #48] - ldp x16, x17, [y, #176] - sbcs x6, x6, x16 - sbcs x7, x7, x17 - - ldp x8, x9, [y, #64] - ldp x16, x17, [y, #192] - sbcs x8, x8, x16 - sbcs x9, x9, x17 - - ldp x10, x11, [y, #80] - ldp x16, x17, [y, #208] - sbcs x10, x10, x16 - sbcs x11, x11, x17 - - ldp x12, x13, [y, #96] - ldp x16, x17, [y, #224] - sbcs x12, x12, x16 - sbcs x13, x13, x17 - - ldp x14, x15, [y, #112] - ldp x16, x17, [y, #240] - sbcs x14, x14, x16 - sbcs x15, x15, x17 - - sbc y, xzr, xzr - - adds xzr, y, y - - eor x0, x0, y - adcs x0, x0, xzr - eor x1, x1, y - adcs x1, x1, xzr - stp x0, x1, [t, #128] - - eor x2, x2, y - adcs x2, x2, xzr - eor x3, x3, y - adcs x3, x3, xzr - stp x2, x3, [t, #128+16] - - eor x4, x4, y - adcs x4, x4, xzr - eor x5, x5, y - adcs x5, x5, xzr - stp x4, x5, [t, #128+32] - - eor x6, x6, y - adcs x6, x6, xzr - eor x7, x7, y - adcs x7, x7, xzr - stp x6, x7, [t, #128+48] - - eor x8, x8, y - adcs x8, x8, xzr - eor x9, x9, y - adcs x9, x9, xzr - stp x8, x9, [t, #128+64] - - eor x10, x10, y - adcs x10, x10, xzr - eor x11, x11, y - adcs x11, x11, xzr - stp x10, x11, [t, #128+80] - - eor x12, x12, y - adcs x12, x12, xzr - eor x13, x13, y - adcs x13, x13, xzr - stp x12, x13, [t, #128+96] - - eor x14, x14, y - adcs x14, x14, xzr - eor x15, x15, y - adc x15, x15, xzr - stp x14, x15, [t, #128+112] - - eor y, y, x - -// Compute H' = H + L_top in place of H (it cannot overflow) - - ldp x0, x1, [z, #16*16] - ldp x2, x3, [z, #16*L] - adds x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*16] - - ldp x0, x1, [z, #16*17] - ldp x2, x3, [z, #16*9] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*17] - - ldp x0, x1, [z, #16*18] - ldp x2, x3, [z, #16*10] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*18] - - ldp x0, x1, [z, #16*19] - ldp x2, x3, [z, #16*11] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*19] - - ldp x0, x1, [z, #16*20] - ldp x2, x3, [z, #16*12] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*20] - - ldp x0, x1, [z, #16*21] - ldp x2, x3, [z, #16*13] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*21] - - ldp x0, x1, [z, #16*22] - ldp x2, x3, [z, #16*14] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*22] - - ldp x0, x1, [z, #16*23] - ldp x2, x3, [z, #16*15] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*23] - - ldp x0, x1, [z, #16*24] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*24] - - ldp x0, x1, [z, #16*25] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*25] - - ldp x0, x1, [z, #16*26] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*26] - - ldp x0, x1, [z, #16*27] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*27] - - ldp x0, x1, [z, #16*28] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*28] - - ldp x0, x1, [z, #16*29] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*29] - - ldp x0, x1, [z, #16*30] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*30] - - ldp x0, x1, [z, #16*31] - adcs x0, x0, xzr - adc x1, x1, xzr - stp x0, x1, [z, #16*31] - -// Compute M = |x_lo - x_hi| * |y_hi - y_lo|, size 32 - - add x0, t, #16*K - mov x1, t - add x2, t, #8*K - add x3, t, #32*K - bl bignum_kmul_32_64_local_kmul_16_32 - -// Add the interlocking H' and L_bot terms -// Intercept 
the carry at the 3k position and store it in x. -// Again, we no longer need the input x was pointing at. - - ldp x0, x1, [z, #16*16] - ldp x2, x3, [z] - adds x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*8] - - ldp x0, x1, [z, #16*17] - ldp x2, x3, [z, #16*1] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*9] - - ldp x0, x1, [z, #16*18] - ldp x2, x3, [z, #16*2] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*10] - - ldp x0, x1, [z, #16*19] - ldp x2, x3, [z, #16*3] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*11] - - ldp x0, x1, [z, #16*20] - ldp x2, x3, [z, #16*4] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*12] - - ldp x0, x1, [z, #16*21] - ldp x2, x3, [z, #16*5] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*13] - - ldp x0, x1, [z, #16*22] - ldp x2, x3, [z, #16*6] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*14] - - ldp x0, x1, [z, #16*23] - ldp x2, x3, [z, #16*7] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*15] - - ldp x0, x1, [z, #16*16] - ldp x2, x3, [z, #16*24] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*16] - - ldp x0, x1, [z, #16*17] - ldp x2, x3, [z, #16*25] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*17] - - ldp x0, x1, [z, #16*18] - ldp x2, x3, [z, #16*26] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*18] - - ldp x0, x1, [z, #16*19] - ldp x2, x3, [z, #16*27] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*19] - - ldp x0, x1, [z, #16*20] - ldp x2, x3, [z, #16*28] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*20] - - ldp x0, x1, [z, #16*21] - ldp x2, x3, [z, #16*29] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*21] - - ldp x0, x1, [z, #16*22] - ldp x2, x3, [z, #16*30] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*22] - - ldp x0, x1, [z, #16*23] - ldp x2, x3, [z, #16*31] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*23] - - cset x, cs - -// Add the sign-adjusted mid-term cross product M - - cmn y, y - - ldp x0, x1, [z, #128] - ldp x2, x3, [t, #128+128] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #128] - - ldp x0, x1, [z, #144] - ldp x2, x3, [t, #128+144] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #144] - - ldp x0, x1, [z, #160] - ldp x2, x3, [t, #128+160] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #160] - - ldp x0, x1, [z, #176] - ldp x2, x3, [t, #128+176] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #176] - - ldp x0, x1, [z, #192] - ldp x2, x3, [t, #128+192] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #192] - - ldp x0, x1, [z, #208] - ldp x2, x3, [t, #128+208] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #208] - - ldp x0, x1, [z, #224] - ldp x2, x3, [t, #128+224] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #224] - - ldp x0, x1, [z, #240] - ldp x2, x3, [t, #128+240] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #240] - - ldp x0, x1, [z, #256] - ldp x2, x3, [t, #128+256] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #256] - - ldp x0, x1, [z, #272] - ldp x2, x3, [t, #128+272] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #272] - - ldp x0, x1, [z, #288] - ldp x2, x3, [t, #128+288] - eor x2, x2, 
y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #288] - - ldp x0, x1, [z, #304] - ldp x2, x3, [t, #128+304] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #304] - - ldp x0, x1, [z, #320] - ldp x2, x3, [t, #128+320] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #320] - - ldp x0, x1, [z, #336] - ldp x2, x3, [t, #128+336] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #336] - - ldp x0, x1, [z, #352] - ldp x2, x3, [t, #128+352] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #352] - - ldp x0, x1, [z, #368] - ldp x2, x3, [t, #128+368] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #368] - -// Get the next digits effectively resulting so far starting at 3k -// [...,c,c,c,c,x] - - adcs x, y, x - adc c, y, xzr - -// Now propagate through the top quarter of the result - - ldp x0, x1, [z, #16*24] - adds x0, x0, x - adcs x1, x1, c - stp x0, x1, [z, #16*24] - - ldp x0, x1, [z, #16*25] - adcs x0, x0, c - adcs x1, x1, c - stp x0, x1, [z, #16*25] - - ldp x0, x1, [z, #16*26] - adcs x0, x0, c - adcs x1, x1, c - stp x0, x1, [z, #16*26] - - ldp x0, x1, [z, #16*27] - adcs x0, x0, c - adcs x1, x1, c - stp x0, x1, [z, #16*27] - - ldp x0, x1, [z, #16*28] - adcs x0, x0, c - adcs x1, x1, c - stp x0, x1, [z, #16*28] - - ldp x0, x1, [z, #16*29] - adcs x0, x0, c - adcs x1, x1, c - stp x0, x1, [z, #16*29] - - ldp x0, x1, [z, #16*30] - adcs x0, x0, c - adcs x1, x1, c - stp x0, x1, [z, #16*30] - - ldp x0, x1, [z, #16*31] - adcs x0, x0, c - adc x1, x1, c - stp x0, x1, [z, #16*31] - -// Restore and return - - ldp x29, x30, [sp], #16 - ldp x27, x28, [sp], #16 - ldp x25, x26, [sp], #16 - ldp x23, x24, [sp], #16 - ldp x21, x22, [sp], #16 - ldp x19, x20, [sp], #16 - ret - -// Local copy of bignum_kmul_16_32, identical to main one except that it -// only preserves the key registers we need to be stable in the main code. -// This includes in turn a copy of bignum_mul_8_16. - -bignum_kmul_32_64_local_kmul_16_32: - stp x19, x20, [sp, -16]! - stp x21, x22, [sp, -16]! - stp x23, x30, [sp, -16]! 
- mov x25, x0 - mov x26, x1 - mov x27, x2 - mov x28, x3 - bl bignum_kmul_32_64_local_mul_8_16 - ldp x10, x11, [x26] - ldp x8, x9, [x26, #64] - subs x10, x10, x8 - sbcs x11, x11, x9 - ldp x12, x13, [x26, #16] - ldp x8, x9, [x26, #80] - sbcs x12, x12, x8 - sbcs x13, x13, x9 - ldp x14, x15, [x26, #32] - ldp x8, x9, [x26, #96] - sbcs x14, x14, x8 - sbcs x15, x15, x9 - ldp x16, x17, [x26, #48] - ldp x8, x9, [x26, #112] - sbcs x16, x16, x8 - sbcs x17, x17, x9 - csetm x29, cc - cmn x29, x29 - eor x10, x10, x29 - adcs x10, x10, xzr - eor x11, x11, x29 - adcs x11, x11, xzr - stp x10, x11, [x28] - eor x12, x12, x29 - adcs x12, x12, xzr - eor x13, x13, x29 - adcs x13, x13, xzr - stp x12, x13, [x28, #16] - eor x14, x14, x29 - adcs x14, x14, xzr - eor x15, x15, x29 - adcs x15, x15, xzr - stp x14, x15, [x28, #32] - eor x16, x16, x29 - adcs x16, x16, xzr - eor x17, x17, x29 - adcs x17, x17, xzr - stp x16, x17, [x28, #48] - add x0, x25, #0x80 - add x1, x26, #0x40 - add x2, x27, #0x40 - bl bignum_kmul_32_64_local_mul_8_16 - ldp x10, x11, [x27] - ldp x8, x9, [x27, #64] - subs x10, x8, x10 - sbcs x11, x9, x11 - ldp x12, x13, [x27, #16] - ldp x8, x9, [x27, #80] - sbcs x12, x8, x12 - sbcs x13, x9, x13 - ldp x14, x15, [x27, #32] - ldp x8, x9, [x27, #96] - sbcs x14, x8, x14 - sbcs x15, x9, x15 - ldp x16, x17, [x27, #48] - ldp x8, x9, [x27, #112] - sbcs x16, x8, x16 - sbcs x17, x9, x17 - csetm x19, cc - cmn x19, x19 - eor x10, x10, x19 - adcs x10, x10, xzr - eor x11, x11, x19 - adcs x11, x11, xzr - stp x10, x11, [x28, #64] - eor x12, x12, x19 - adcs x12, x12, xzr - eor x13, x13, x19 - adcs x13, x13, xzr - stp x12, x13, [x28, #80] - eor x14, x14, x19 - adcs x14, x14, xzr - eor x15, x15, x19 - adcs x15, x15, xzr - stp x14, x15, [x28, #96] - eor x16, x16, x19 - adcs x16, x16, xzr - eor x17, x17, x19 - adcs x17, x17, xzr - stp x16, x17, [x28, #112] - eor x29, x29, x19 - ldp x10, x11, [x25, #128] - ldp x12, x13, [x25, #64] - adds x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [x25, #128] - ldp x10, x11, [x25, #144] - ldp x12, x13, [x25, #80] - adcs x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [x25, #144] - ldp x10, x11, [x25, #160] - ldp x12, x13, [x25, #96] - adcs x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [x25, #160] - ldp x10, x11, [x25, #176] - ldp x12, x13, [x25, #112] - adcs x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [x25, #176] - ldp x10, x11, [x25, #192] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [x25, #192] - ldp x10, x11, [x25, #208] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [x25, #208] - ldp x10, x11, [x25, #224] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [x25, #224] - ldp x10, x11, [x25, #240] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [x25, #240] - add x0, x28, #0x80 - mov x1, x28 - add x2, x28, #0x40 - bl bignum_kmul_32_64_local_mul_8_16 - ldp x0, x1, [x25] - ldp x16, x17, [x25, #128] - adds x0, x0, x16 - adcs x1, x1, x17 - ldp x2, x3, [x25, #16] - ldp x16, x17, [x25, #144] - adcs x2, x2, x16 - adcs x3, x3, x17 - ldp x4, x5, [x25, #32] - ldp x16, x17, [x25, #160] - adcs x4, x4, x16 - adcs x5, x5, x17 - ldp x6, x7, [x25, #48] - ldp x16, x17, [x25, #176] - adcs x6, x6, x16 - adcs x7, x7, x17 - ldp x8, x9, [x25, #128] - ldp x16, x17, [x25, #192] - adcs x8, x8, x16 - adcs x9, x9, x17 - ldp x10, x11, [x25, #144] - ldp x16, x17, [x25, #208] - adcs x10, x10, x16 - adcs x11, x11, x17 - ldp x12, x13, [x25, #160] - ldp x16, x17, [x25, #224] - adcs x12, x12, x16 - adcs x13, x13, x17 - ldp x14, x15, [x25, #176] - ldp x16, x17, 
[x25, #240] - adcs x14, x14, x16 - adcs x15, x15, x17 - cset x26, cs - cmn x29, x29 - ldp x16, x17, [x28, #128] - eor x16, x16, x29 - adcs x0, x0, x16 - eor x17, x17, x29 - adcs x1, x1, x17 - stp x0, x1, [x25, #64] - ldp x16, x17, [x28, #144] - eor x16, x16, x29 - adcs x2, x2, x16 - eor x17, x17, x29 - adcs x3, x3, x17 - stp x2, x3, [x25, #80] - ldp x16, x17, [x28, #160] - eor x16, x16, x29 - adcs x4, x4, x16 - eor x17, x17, x29 - adcs x5, x5, x17 - stp x4, x5, [x25, #96] - ldp x16, x17, [x28, #176] - eor x16, x16, x29 - adcs x6, x6, x16 - eor x17, x17, x29 - adcs x7, x7, x17 - stp x6, x7, [x25, #112] - ldp x16, x17, [x28, #192] - eor x16, x16, x29 - adcs x8, x8, x16 - eor x17, x17, x29 - adcs x9, x9, x17 - stp x8, x9, [x25, #128] - ldp x16, x17, [x28, #208] - eor x16, x16, x29 - adcs x10, x10, x16 - eor x17, x17, x29 - adcs x11, x11, x17 - stp x10, x11, [x25, #144] - ldp x16, x17, [x28, #224] - eor x16, x16, x29 - adcs x12, x12, x16 - eor x17, x17, x29 - adcs x13, x13, x17 - stp x12, x13, [x25, #160] - ldp x16, x17, [x28, #240] - eor x16, x16, x29 - adcs x14, x14, x16 - eor x17, x17, x29 - adcs x15, x15, x17 - stp x14, x15, [x25, #176] - adcs x27, x29, x26 - adc x28, x29, xzr - ldp x10, x11, [x25, #192] - adds x10, x10, x27 - adcs x11, x11, x28 - stp x10, x11, [x25, #192] - ldp x10, x11, [x25, #208] - adcs x10, x10, x28 - adcs x11, x11, x28 - stp x10, x11, [x25, #208] - ldp x10, x11, [x25, #224] - adcs x10, x10, x28 - adcs x11, x11, x28 - stp x10, x11, [x25, #224] - ldp x10, x11, [x25, #240] - adcs x10, x10, x28 - adcs x11, x11, x28 - stp x10, x11, [x25, #240] - ldp x23, x30, [sp], #16 - ldp x21, x22, [sp], #16 - ldp x19, x20, [sp], #16 - ret - -bignum_kmul_32_64_local_mul_8_16: - ldp x3, x4, [x1] - ldp x7, x8, [x2] - ldp x5, x6, [x1, #16] - ldp x9, x10, [x2, #16] - mul x11, x3, x7 - mul x15, x4, x8 - mul x16, x5, x9 - mul x17, x6, x10 - umulh x19, x3, x7 - adds x15, x15, x19 - umulh x19, x4, x8 - adcs x16, x16, x19 - umulh x19, x5, x9 - adcs x17, x17, x19 - umulh x19, x6, x10 - adc x19, x19, xzr - adds x12, x15, x11 - adcs x15, x16, x15 - adcs x16, x17, x16 - adcs x17, x19, x17 - adc x19, xzr, x19 - adds x13, x15, x11 - adcs x14, x16, x12 - adcs x15, x17, x15 - adcs x16, x19, x16 - adcs x17, xzr, x17 - adc x19, xzr, x19 - subs x24, x5, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x9 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x16, x16, x22 - eor x21, x21, x20 - adcs x17, x17, x21 - adc x19, x19, x20 - subs x24, x3, x4 - cneg x24, x24, cc - csetm x20, cc - subs x21, x8, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x12, x12, x22 - eor x21, x21, x20 - adcs x13, x13, x21 - adcs x14, x14, x20 - adcs x15, x15, x20 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x4, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x8 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x15, x15, x22 - eor x21, x21, x20 - adcs x16, x16, x21 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x3, x5 - cneg x24, x24, cc - csetm x20, cc - subs x21, x9, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x13, x13, x22 - eor x21, x21, x20 - adcs x14, x14, x21 - adcs x15, x15, x20 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x3, x6 - cneg 
x24, x24, cc - csetm x20, cc - subs x21, x10, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x14, x14, x22 - eor x21, x21, x20 - adcs x15, x15, x21 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x4, x5 - cneg x24, x24, cc - csetm x20, cc - subs x21, x9, x8 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x14, x14, x22 - eor x21, x21, x20 - adcs x15, x15, x21 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - ldp x3, x4, [x1, #32] - stp x11, x12, [x0] - ldp x7, x8, [x2, #32] - stp x13, x14, [x0, #16] - ldp x5, x6, [x1, #48] - stp x15, x16, [x0, #32] - ldp x9, x10, [x2, #48] - stp x17, x19, [x0, #48] - mul x11, x3, x7 - mul x15, x4, x8 - mul x16, x5, x9 - mul x17, x6, x10 - umulh x19, x3, x7 - adds x15, x15, x19 - umulh x19, x4, x8 - adcs x16, x16, x19 - umulh x19, x5, x9 - adcs x17, x17, x19 - umulh x19, x6, x10 - adc x19, x19, xzr - adds x12, x15, x11 - adcs x15, x16, x15 - adcs x16, x17, x16 - adcs x17, x19, x17 - adc x19, xzr, x19 - adds x13, x15, x11 - adcs x14, x16, x12 - adcs x15, x17, x15 - adcs x16, x19, x16 - adcs x17, xzr, x17 - adc x19, xzr, x19 - ldp x22, x21, [x0, #32] - adds x11, x11, x22 - adcs x12, x12, x21 - ldp x22, x21, [x0, #48] - adcs x13, x13, x22 - adcs x14, x14, x21 - adcs x15, x15, xzr - adcs x16, x16, xzr - adcs x17, x17, xzr - adc x19, x19, xzr - subs x24, x5, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x9 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x16, x16, x22 - eor x21, x21, x20 - adcs x17, x17, x21 - adc x19, x19, x20 - subs x24, x3, x4 - cneg x24, x24, cc - csetm x20, cc - subs x21, x8, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x12, x12, x22 - eor x21, x21, x20 - adcs x13, x13, x21 - adcs x14, x14, x20 - adcs x15, x15, x20 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x4, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x8 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x15, x15, x22 - eor x21, x21, x20 - adcs x16, x16, x21 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x3, x5 - cneg x24, x24, cc - csetm x20, cc - subs x21, x9, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x13, x13, x22 - eor x21, x21, x20 - adcs x14, x14, x21 - adcs x15, x15, x20 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x3, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x14, x14, x22 - eor x21, x21, x20 - adcs x15, x15, x21 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x4, x5 - cneg x24, x24, cc - csetm x20, cc - subs x21, x9, x8 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x14, x14, x22 - eor x21, x21, x20 - adcs x15, x15, x21 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - ldp x22, x21, [x1] - subs x3, x3, x22 - sbcs x4, x4, x21 - ldp x22, x21, [x1, #16] - sbcs x5, x5, x22 - sbcs x6, x6, x21 - csetm x24, cc - stp x11, x12, [x0, #64] - 
ldp x22, x21, [x2] - subs x7, x22, x7 - sbcs x8, x21, x8 - ldp x22, x21, [x2, #16] - sbcs x9, x22, x9 - sbcs x10, x21, x10 - csetm x1, cc - stp x13, x14, [x0, #80] - eor x3, x3, x24 - subs x3, x3, x24 - eor x4, x4, x24 - sbcs x4, x4, x24 - eor x5, x5, x24 - sbcs x5, x5, x24 - eor x6, x6, x24 - sbc x6, x6, x24 - stp x15, x16, [x0, #96] - eor x7, x7, x1 - subs x7, x7, x1 - eor x8, x8, x1 - sbcs x8, x8, x1 - eor x9, x9, x1 - sbcs x9, x9, x1 - eor x10, x10, x1 - sbc x10, x10, x1 - stp x17, x19, [x0, #112] - eor x1, x1, x24 - mul x11, x3, x7 - mul x15, x4, x8 - mul x16, x5, x9 - mul x17, x6, x10 - umulh x19, x3, x7 - adds x15, x15, x19 - umulh x19, x4, x8 - adcs x16, x16, x19 - umulh x19, x5, x9 - adcs x17, x17, x19 - umulh x19, x6, x10 - adc x19, x19, xzr - adds x12, x15, x11 - adcs x15, x16, x15 - adcs x16, x17, x16 - adcs x17, x19, x17 - adc x19, xzr, x19 - adds x13, x15, x11 - adcs x14, x16, x12 - adcs x15, x17, x15 - adcs x16, x19, x16 - adcs x17, xzr, x17 - adc x19, xzr, x19 - subs x24, x5, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x9 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x16, x16, x22 - eor x21, x21, x20 - adcs x17, x17, x21 - adc x19, x19, x20 - subs x24, x3, x4 - cneg x24, x24, cc - csetm x20, cc - subs x21, x8, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x12, x12, x22 - eor x21, x21, x20 - adcs x13, x13, x21 - adcs x14, x14, x20 - adcs x15, x15, x20 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x4, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x8 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x15, x15, x22 - eor x21, x21, x20 - adcs x16, x16, x21 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x3, x5 - cneg x24, x24, cc - csetm x20, cc - subs x21, x9, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x13, x13, x22 - eor x21, x21, x20 - adcs x14, x14, x21 - adcs x15, x15, x20 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x3, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x14, x14, x22 - eor x21, x21, x20 - adcs x15, x15, x21 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x4, x5 - cneg x24, x24, cc - csetm x20, cc - subs x21, x9, x8 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x14, x14, x22 - eor x21, x21, x20 - adcs x15, x15, x21 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - ldp x3, x4, [x0] - ldp x7, x8, [x0, #64] - adds x3, x3, x7 - adcs x4, x4, x8 - ldp x5, x6, [x0, #16] - ldp x9, x10, [x0, #80] - adcs x5, x5, x9 - adcs x6, x6, x10 - ldp x20, x21, [x0, #96] - adcs x7, x7, x20 - adcs x8, x8, x21 - ldp x22, x23, [x0, #112] - adcs x9, x9, x22 - adcs x10, x10, x23 - adcs x24, x1, xzr - adc x2, x1, xzr - cmn x1, #0x1 - eor x11, x11, x1 - adcs x3, x11, x3 - eor x12, x12, x1 - adcs x4, x12, x4 - eor x13, x13, x1 - adcs x5, x13, x5 - eor x14, x14, x1 - adcs x6, x14, x6 - eor x15, x15, x1 - adcs x7, x15, x7 - eor x16, x16, x1 - adcs x8, x16, x8 - eor x17, x17, x1 - adcs x9, x17, x9 - eor x19, x19, x1 - adcs x10, x19, x10 - adcs 
x20, x20, x24 - adcs x21, x21, x2 - adcs x22, x22, x2 - adc x23, x23, x2 - stp x3, x4, [x0, #32] - stp x5, x6, [x0, #48] - stp x7, x8, [x0, #64] - stp x9, x10, [x0, #80] - stp x20, x21, [x0, #96] - stp x22, x23, [x0, #112] - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif diff --git a/third_party/s2n-bignum/arm/fastmul/bignum_ksqr_16_32_neon.S b/third_party/s2n-bignum/arm/fastmul/bignum_ksqr_16_32_neon.S deleted file mode 100644 index 6be2bcb3846..00000000000 --- a/third_party/s2n-bignum/arm/fastmul/bignum_ksqr_16_32_neon.S +++ /dev/null @@ -1,658 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Square, z := x^2 -// Input x[16]; output z[32]; temporary buffer t[>=24] -// -// extern void bignum_ksqr_16_32_neon -// (uint64_t z[static 32], uint64_t x[static 16], uint64_t t[static 24]); -// -// This is a Karatsuba-style function squaring half-sized results -// and using temporary buffer t for intermediate results. -// -// Standard ARM ABI: X0 = z, X1 = x, X2 = t -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_ksqr_16_32_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_ksqr_16_32_neon) - .text - .balign 4 - -// Subroutine-safe copies of the output, inputs and temporary buffer pointers - -#define z x23 -#define x x24 -#define t x25 - -// More variables for sign masks, with s also necessarily subroutine-safe - -#define s x19 - - -S2N_BN_SYMBOL(bignum_ksqr_16_32_neon): - -// Save registers, including return address - - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! - stp x23, x24, [sp, #-16]! - stp x25, x30, [sp, #-16]! - -// Move parameters into subroutine-safe places - - mov z, x0 - mov x, x1 - mov t, x2 - -// Compute L = x_lo * y_lo in bottom half of buffer (size 8 x 8 -> 16) - - bl bignum_ksqr_16_32_neon_local_sqr_8_16 - -// Compute absolute difference [t..] 
= |x_lo - x_hi| - - ldp x10, x11, [x] - ldp x8, x9, [x, #64] - subs x10, x10, x8 - sbcs x11, x11, x9 - ldp x12, x13, [x, #16] - ldp x8, x9, [x, #80] - sbcs x12, x12, x8 - sbcs x13, x13, x9 - ldp x14, x15, [x, #32] - ldp x8, x9, [x, #96] - sbcs x14, x14, x8 - sbcs x15, x15, x9 - ldp x16, x17, [x, #48] - ldp x8, x9, [x, #112] - sbcs x16, x16, x8 - sbcs x17, x17, x9 - csetm s, cc - adds xzr, s, s - eor x10, x10, s - adcs x10, x10, xzr - eor x11, x11, s - adcs x11, x11, xzr - stp x10, x11, [t] - eor x12, x12, s - adcs x12, x12, xzr - eor x13, x13, s - adcs x13, x13, xzr - stp x12, x13, [t, #16] - eor x14, x14, s - adcs x14, x14, xzr - eor x15, x15, s - adcs x15, x15, xzr - stp x14, x15, [t, #32] - eor x16, x16, s - adcs x16, x16, xzr - eor x17, x17, s - adcs x17, x17, xzr - stp x16, x17, [t, #48] - -// Compute H = x_hi * y_hi in top half of buffer (size 8 x 8 -> 16) - - add x0, z, #128 - add x1, x, #64 - bl bignum_ksqr_16_32_neon_local_sqr_8_16 - -// Compute H' = H + L_top in place of H (it cannot overflow) -// First add 8-sized block then propagate carry through next 8 - - ldp x10, x11, [z, #128] - ldp x12, x13, [z, #64] - adds x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [z, #128] - - ldp x10, x11, [z, #128+16] - ldp x12, x13, [z, #64+16] - adcs x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [z, #128+16] - - ldp x10, x11, [z, #128+32] - ldp x12, x13, [z, #64+32] - adcs x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [z, #128+32] - - ldp x10, x11, [z, #128+48] - ldp x12, x13, [z, #64+48] - adcs x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [z, #128+48] - - ldp x10, x11, [z, #128+64] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [z, #128+64] - - ldp x10, x11, [z, #128+80] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [z, #128+80] - - ldp x10, x11, [z, #128+96] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [z, #128+96] - - ldp x10, x11, [z, #128+112] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [z, #128+112] - -// Compute M = |x_lo - x_hi| * |y_hi - y_lo| in [t+8...], size 16 - - add x0, t, #64 - mov x1, t - bl bignum_ksqr_16_32_neon_local_sqr_8_16 - -// Add the interlocking H' and L_bot terms, storing in registers x15..x0 -// Intercept the carry at the 8 + 16 = 24 position and store it in x. -// (Note that we no longer need the input x was pointing at.) 
- - ldp x0, x1, [z] - ldp x16, x17, [z, #128] - adds x0, x0, x16 - adcs x1, x1, x17 - ldp x2, x3, [z, #16] - ldp x16, x17, [z, #144] - adcs x2, x2, x16 - adcs x3, x3, x17 - ldp x4, x5, [z, #32] - ldp x16, x17, [z, #160] - adcs x4, x4, x16 - adcs x5, x5, x17 - ldp x6, x7, [z, #48] - ldp x16, x17, [z, #176] - adcs x6, x6, x16 - adcs x7, x7, x17 - ldp x8, x9, [z, #128] - ldp x16, x17, [z, #192] - adcs x8, x8, x16 - adcs x9, x9, x17 - ldp x10, x11, [z, #144] - ldp x16, x17, [z, #208] - adcs x10, x10, x16 - adcs x11, x11, x17 - ldp x12, x13, [z, #160] - ldp x16, x17, [z, #224] - adcs x12, x12, x16 - adcs x13, x13, x17 - ldp x14, x15, [z, #176] - ldp x16, x17, [z, #240] - adcs x14, x14, x16 - adcs x15, x15, x17 - cset x, cs - -// Subtract the mid-term cross product M - - ldp x16, x17, [t, #64] - subs x0, x0, x16 - sbcs x1, x1, x17 - stp x0, x1, [z, #64] - ldp x16, x17, [t, #80] - sbcs x2, x2, x16 - sbcs x3, x3, x17 - stp x2, x3, [z, #80] - ldp x16, x17, [t, #96] - sbcs x4, x4, x16 - sbcs x5, x5, x17 - stp x4, x5, [z, #96] - ldp x16, x17, [t, #112] - sbcs x6, x6, x16 - sbcs x7, x7, x17 - stp x6, x7, [z, #112] - ldp x16, x17, [t, #128] - sbcs x8, x8, x16 - sbcs x9, x9, x17 - stp x8, x9, [z, #128] - ldp x16, x17, [t, #144] - sbcs x10, x10, x16 - sbcs x11, x11, x17 - stp x10, x11, [z, #144] - ldp x16, x17, [t, #160] - sbcs x12, x12, x16 - sbcs x13, x13, x17 - stp x12, x13, [z, #160] - ldp x16, x17, [t, #176] - sbcs x14, x14, x16 - sbcs x15, x15, x17 - stp x14, x15, [z, #176] - -// Get the next digits effectively resulting so far starting at 24 - - sbcs x, x, xzr - csetm t, cc - -// Now the final 8 digits of padding; the first one is special in using x -// and also in getting the carry chain started - - ldp x10, x11, [z, #192] - adds x10, x10, x - adcs x11, x11, t - stp x10, x11, [z, #192] - ldp x10, x11, [z, #208] - adcs x10, x10, t - adcs x11, x11, t - stp x10, x11, [z, #208] - ldp x10, x11, [z, #224] - adcs x10, x10, t - adcs x11, x11, t - stp x10, x11, [z, #224] - ldp x10, x11, [z, #240] - adcs x10, x10, t - adcs x11, x11, t - stp x10, x11, [z, #240] - -// Restore registers and return - - ldp x25, x30, [sp], #16 - ldp x23, x24, [sp], #16 - ldp x21, x22, [sp], #16 - ldp x19, x20, [sp], #16 - - ret - -// ----------------------------------------------------------------------------- -// Local 8x8->16 squaring routine, shared to reduce code size. Effectively -// the same as bignum_sqr_8_16_neon without the scratch register preservation. -// ----------------------------------------------------------------------------- - -bignum_ksqr_16_32_neon_local_sqr_8_16: -// Load registers. 
- ldp x2, x3, [x1] -ldr q20, [x1] - ldp x4, x5, [x1, #16] -ldr q21, [x1, #16] - ldp x6, x7, [x1, #32] -ldr q22, [x1, #32] - ldp x8, x9, [x1, #48] -ldr q23, [x1, #48] -movi v30.2d, #0xffffffff - - mul x17, x2, x4 - mul x14, x3, x5 - -// Scalar+NEON: square the lower half with a near-clone of bignum_sqr_4_8 -// NEON: prepare 64x64->128 squaring of two 64-bit ints (x2, x3) -ext v1.16b, v20.16b, v20.16b, #8 - umulh x20, x2, x4 -shrn v2.2s, v20.2d, #32 - subs x21, x2, x3 -zip1 v0.2s, v20.2s, v1.2s - cneg x21, x21, cc // cc = lo, ul, last -umull v5.2d, v2.2s, v2.2s - csetm x11, cc // cc = lo, ul, last -umull v6.2d, v2.2s, v0.2s - subs x12, x5, x4 -umull v3.2d, v0.2s, v0.2s - cneg x12, x12, cc // cc = lo, ul, last -mov v1.16b, v6.16b - mul x13, x21, x12 -usra v1.2d, v3.2d, #32 - umulh x12, x21, x12 -and v4.16b, v1.16b, v30.16b - cinv x11, x11, cc // cc = lo, ul, last -add v4.2d, v4.2d, v6.2d - eor x13, x13, x11 -usra v5.2d, v4.2d, #32 - eor x12, x12, x11 -sli v3.2d, v4.2d, #32 - adds x19, x17, x20 -usra v5.2d, v1.2d, #32 - adc x20, x20, xzr - // NEON: prepare 64x64->128 squaring of two 64-bit ints (x4, x5) - ext v1.16b, v21.16b, v21.16b, #8 - umulh x21, x3, x5 - shrn v2.2s, v21.2d, #32 - adds x19, x19, x14 - zip1 v0.2s, v21.2s, v1.2s - adcs x20, x20, x21 - adc x21, x21, xzr - adds x20, x20, x14 - adc x21, x21, xzr - cmn x11, #0x1 - adcs x19, x19, x13 -mov x13, v3.d[1] // mul x13, x3, x3 - adcs x20, x20, x12 -mov x14, v5.d[1] // umulh x14, x3, x3 - adc x21, x21, x11 -mov x12, v3.d[0] // mul x12, x2, x2 - adds x17, x17, x17 -mov x11, v5.d[0] // umulh x11, x2, x2 - adcs x19, x19, x19 - umull v5.2d, v2.2s, v2.2s - adcs x20, x20, x20 - umull v6.2d, v2.2s, v0.2s - adcs x21, x21, x21 - umull v3.2d, v0.2s, v0.2s - adc x10, xzr, xzr - mov v1.16b, v6.16b - - mul x15, x2, x3 - usra v1.2d, v3.2d, #32 - umulh x16, x2, x3 - and v4.16b, v1.16b, v30.16b - adds x11, x11, x15 - add v4.2d, v4.2d, v6.2d - adcs x13, x13, x16 - usra v5.2d, v4.2d, #32 - adc x14, x14, xzr - sli v3.2d, v4.2d, #32 - adds x11, x11, x15 - usra v5.2d, v1.2d, #32 - adcs x13, x13, x16 - adc x14, x14, xzr - stp x12, x11, [x0] - mov x11, v5.d[0] // umulh x11, x4, x4 - adds x17, x17, x13 - mov x13, v3.d[1] // mul x13, x5, x5 - adcs x19, x19, x14 - mov x14, v5.d[1] // umulh x14, x5, x5 - adcs x20, x20, xzr - mov x12, v3.d[0] // mul x12, x4, x4 - adcs x21, x21, xzr -// NEON: prepare muls in the upper half -ext v1.16b, v22.16b, v22.16b, #8 - adc x10, x10, xzr -shrn v2.2s, v22.2d, #32 - stp x17, x19, [x0, #16] -zip1 v0.2s, v22.2s, v1.2s - mul x15, x4, x5 -umull v5.2d, v2.2s, v2.2s - umulh x16, x4, x5 -umull v6.2d, v2.2s, v0.2s - adds x11, x11, x15 -umull v3.2d, v0.2s, v0.2s - adcs x13, x13, x16 -mov v1.16b, v6.16b - adc x14, x14, xzr -usra v1.2d, v3.2d, #32 - adds x11, x11, x15 -and v4.16b, v1.16b, v30.16b - adcs x13, x13, x16 -add v4.2d, v4.2d, v6.2d - adc x14, x14, xzr -usra v5.2d, v4.2d, #32 - adds x12, x12, x20 -sli v3.2d, v4.2d, #32 - adcs x11, x11, x21 -usra v5.2d, v1.2d, #32 - stp x12, x11, [x0, #32] - // NEON: prepare muls in the upper half - ext v1.16b, v23.16b, v23.16b, #8 - adcs x13, x13, x10 - shrn v2.2s, v23.2d, #32 - adc x14, x14, xzr - zip1 v0.2s, v23.2s, v1.2s - stp x13, x14, [x0, #48] - -// Scalar: square the upper half with a slight variant of the previous block - mul x17, x6, x8 - umull v16.2d, v2.2s, v2.2s - mul x14, x7, x9 - umull v6.2d, v2.2s, v0.2s - umulh x20, x6, x8 - umull v18.2d, v0.2s, v0.2s - subs x21, x6, x7 - cneg x21, x21, cc // cc = lo, ul, last - mov v1.16b, v6.16b - csetm x11, cc // cc = lo, ul, last - subs x12, x9, 
x8 - cneg x12, x12, cc // cc = lo, ul, last - usra v1.2d, v18.2d, #32 - mul x13, x21, x12 - and v4.16b, v1.16b, v30.16b - umulh x12, x21, x12 - add v4.2d, v4.2d, v6.2d - cinv x11, x11, cc // cc = lo, ul, last - eor x13, x13, x11 - eor x12, x12, x11 - usra v16.2d, v4.2d, #32 - adds x19, x17, x20 - adc x20, x20, xzr - sli v18.2d, v4.2d, #32 - umulh x21, x7, x9 - adds x19, x19, x14 - adcs x20, x20, x21 - adc x21, x21, xzr - adds x20, x20, x14 -mov x14, v5.d[1] - adc x21, x21, xzr - cmn x11, #0x1 - adcs x19, x19, x13 -mov x13, v3.d[1] - adcs x20, x20, x12 -mov x12, v3.d[0] - adc x21, x21, x11 -mov x11, v5.d[0] - adds x17, x17, x17 - adcs x19, x19, x19 - usra v16.2d, v1.2d, #32 - adcs x20, x20, x20 - adcs x21, x21, x21 - adc x10, xzr, xzr -// NEON: two mul+umulhs for the next stage -uzp2 v17.4s, v21.4s, v23.4s - mul x15, x6, x7 -xtn v4.2s, v23.2d - umulh x16, x6, x7 - mov x22, v16.d[0] - adds x11, x11, x15 - adcs x13, x13, x16 -xtn v5.2s, v21.2d - adc x14, x14, xzr - adds x11, x11, x15 -rev64 v1.4s, v21.4s - adcs x13, x13, x16 - adc x14, x14, xzr - stp x12, x11, [x0, #64] - adds x17, x17, x13 - mov x13, v18.d[1] - adcs x19, x19, x14 - mov x14, v16.d[1] - adcs x20, x20, xzr - mov x12, v18.d[0] - adcs x21, x21, xzr - adc x10, x10, xzr -umull v6.2d, v4.2s, v5.2s - stp x17, x19, [x0, #80] -umull v7.2d, v4.2s, v17.2s - mul x15, x8, x9 -uzp2 v16.4s, v23.4s, v23.4s - umulh x16, x8, x9 -mul v0.4s, v1.4s, v23.4s - adds x11, x22, x15 - adcs x13, x13, x16 -usra v7.2d, v6.2d, #32 - adc x14, x14, xzr - adds x11, x11, x15 -umull v1.2d, v16.2s, v17.2s - adcs x13, x13, x16 - adc x14, x14, xzr -uaddlp v0.2d, v0.4s - adds x12, x12, x20 - adcs x11, x11, x21 -and v2.16b, v7.16b, v30.16b -umlal v2.2d, v16.2s, v5.2s -shl v0.2d, v0.2d, #32 -usra v1.2d, v7.2d, #32 -umlal v0.2d, v4.2s, v5.2s -mov x16, v0.d[1] -mov x15, v0.d[0] -usra v1.2d, v2.2d, #32 -mov x20, v1.d[0] -mov x21, v1.d[1] - stp x12, x11, [x0, #96] - adcs x13, x13, x10 - adc x14, x14, xzr - stp x13, x14, [x0, #112] - -// Now get the cross-product in [s7,...,s0] and double it as [c,s7,...,s0] - - mul x10, x2, x6 - mul x14, x3, x7 - umulh x17, x2, x6 - adds x14, x14, x17 - umulh x17, x3, x7 - adcs x15, x15, x17 - adcs x16, x16, x20 - adc x17, x21, xzr - adds x11, x14, x10 - adcs x14, x15, x14 - adcs x15, x16, x15 - adcs x16, x17, x16 - adc x17, xzr, x17 - adds x12, x14, x10 - adcs x13, x15, x11 - adcs x14, x16, x14 - adcs x15, x17, x15 - adcs x16, xzr, x16 - adc x17, xzr, x17 - subs x22, x4, x5 - cneg x22, x22, cc // cc = lo, ul, last - csetm x19, cc // cc = lo, ul, last - subs x20, x9, x8 - cneg x20, x20, cc // cc = lo, ul, last - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc // cc = lo, ul, last - cmn x19, #0x1 - eor x21, x21, x19 - adcs x15, x15, x21 - eor x20, x20, x19 - adcs x16, x16, x20 - adc x17, x17, x19 - subs x22, x2, x3 - cneg x22, x22, cc // cc = lo, ul, last - csetm x19, cc // cc = lo, ul, last - subs x20, x7, x6 - cneg x20, x20, cc // cc = lo, ul, last - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc // cc = lo, ul, last - cmn x19, #0x1 - eor x21, x21, x19 - adcs x11, x11, x21 - eor x20, x20, x19 - adcs x12, x12, x20 - adcs x13, x13, x19 - adcs x14, x14, x19 - adcs x15, x15, x19 - adcs x16, x16, x19 - adc x17, x17, x19 - subs x22, x3, x5 - cneg x22, x22, cc // cc = lo, ul, last - csetm x19, cc // cc = lo, ul, last - subs x20, x9, x7 - cneg x20, x20, cc // cc = lo, ul, last - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc // cc = lo, ul, last - cmn x19, #0x1 - eor x21, x21, x19 - adcs x14, x14, x21 - eor 
x20, x20, x19 - adcs x15, x15, x20 - adcs x16, x16, x19 - adc x17, x17, x19 - subs x22, x2, x4 - cneg x22, x22, cc // cc = lo, ul, last - csetm x19, cc // cc = lo, ul, last - subs x20, x8, x6 - cneg x20, x20, cc // cc = lo, ul, last - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc // cc = lo, ul, last - cmn x19, #0x1 - eor x21, x21, x19 - adcs x12, x12, x21 - eor x20, x20, x19 - adcs x13, x13, x20 - adcs x14, x14, x19 - adcs x15, x15, x19 - adcs x16, x16, x19 - adc x17, x17, x19 - subs x22, x2, x5 - cneg x22, x22, cc // cc = lo, ul, last - csetm x19, cc // cc = lo, ul, last - subs x20, x9, x6 - cneg x20, x20, cc // cc = lo, ul, last - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc // cc = lo, ul, last - cmn x19, #0x1 - eor x21, x21, x19 - adcs x13, x13, x21 - eor x20, x20, x19 - adcs x14, x14, x20 - adcs x15, x15, x19 - adcs x16, x16, x19 - adc x17, x17, x19 - subs x22, x3, x4 - cneg x22, x22, cc // cc = lo, ul, last - csetm x19, cc // cc = lo, ul, last - subs x20, x8, x7 - cneg x20, x20, cc // cc = lo, ul, last - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc // cc = lo, ul, last - cmn x19, #0x1 - eor x21, x21, x19 - adcs x13, x13, x21 - eor x20, x20, x19 - adcs x14, x14, x20 - adcs x15, x15, x19 - adcs x16, x16, x19 - adc x17, x17, x19 - adds x10, x10, x10 - adcs x11, x11, x11 - adcs x12, x12, x12 - adcs x13, x13, x13 - adcs x14, x14, x14 - adcs x15, x15, x15 - adcs x16, x16, x16 - adcs x17, x17, x17 - adc x19, xzr, xzr - -// Add it back to the buffer - - ldp x2, x3, [x0, #32] - adds x10, x10, x2 - adcs x11, x11, x3 - stp x10, x11, [x0, #32] - - ldp x2, x3, [x0, #48] - adcs x12, x12, x2 - adcs x13, x13, x3 - stp x12, x13, [x0, #48] - - ldp x2, x3, [x0, #64] - adcs x14, x14, x2 - adcs x15, x15, x3 - stp x14, x15, [x0, #64] - - ldp x2, x3, [x0, #80] - adcs x16, x16, x2 - adcs x17, x17, x3 - stp x16, x17, [x0, #80] - - ldp x2, x3, [x0, #96] - adcs x2, x2, x19 - adcs x3, x3, xzr - stp x2, x3, [x0, #96] - - ldp x2, x3, [x0, #112] - adcs x2, x2, xzr - adc x3, x3, xzr - stp x2, x3, [x0, #112] - - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif diff --git a/third_party/s2n-bignum/arm/fastmul/bignum_ksqr_32_64_neon.S b/third_party/s2n-bignum/arm/fastmul/bignum_ksqr_32_64_neon.S deleted file mode 100644 index 04197642339..00000000000 --- a/third_party/s2n-bignum/arm/fastmul/bignum_ksqr_32_64_neon.S +++ /dev/null @@ -1,1075 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Square, z := x^2 -// Input x[32]; output z[64]; temporary buffer t[>=72] -// -// extern void bignum_ksqr_32_64_neon -// (uint64_t z[static 64], uint64_t x[static 32], uint64_t t[static 72]); -// -// This is a Karatsuba-style function squaring half-sized results -// and using temporary buffer t for intermediate results. -// -// Standard ARM ABI: X0 = z, X1 = x, X2 = t -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_ksqr_32_64_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_ksqr_32_64_neon) - .text - .balign 4 - -#define K 16 -#define L 8 // (K/2) - -#define z x19 -#define x x20 -#define t x21 - -#define c x16 - - -S2N_BN_SYMBOL(bignum_ksqr_32_64_neon): - -// Save extra registers and return address, store parameters safely - - stp x19, x20, [sp, #-16]! 
- stp x21, x30, [sp, #-16]! - - mov z, x0 - mov x, x1 - mov t, x2 - -// Compute L = x_lo * y_lo in bottom half of buffer (size 16 x 16 -> 32) - - bl bignum_ksqr_32_64_neon_local_ksqr_16_32 - -// Compute H = x_hi * y_hi in top half of buffer (size 16 x 16 -> 32) - - add x0, z, #16*K - add x1, x, #8*K - mov x2, t - bl bignum_ksqr_32_64_neon_local_ksqr_16_32 - -// Compute H' = H + L_top in place of H (it cannot overflow) - - ldp x0, x1, [z, #16*16] - ldp x2, x3, [z, #16*8] - adds x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*16] - - ldp x0, x1, [z, #16*17] - ldp x2, x3, [z, #16*9] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*17] - - ldp x0, x1, [z, #16*18] - ldp x2, x3, [z, #16*10] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*18] - - ldp x0, x1, [z, #16*19] - ldp x2, x3, [z, #16*11] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*19] - - ldp x0, x1, [z, #16*20] - ldp x2, x3, [z, #16*12] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*20] - - ldp x0, x1, [z, #16*21] - ldp x2, x3, [z, #16*13] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*21] - - ldp x0, x1, [z, #16*22] - ldp x2, x3, [z, #16*14] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*22] - - ldp x0, x1, [z, #16*23] - ldp x2, x3, [z, #16*15] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*23] - - ldp x0, x1, [z, #16*24] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*24] - - ldp x0, x1, [z, #16*25] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*25] - - ldp x0, x1, [z, #16*26] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*26] - - ldp x0, x1, [z, #16*27] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*27] - - ldp x0, x1, [z, #16*28] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*28] - - ldp x0, x1, [z, #16*29] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*29] - - ldp x0, x1, [z, #16*30] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*30] - - ldp x0, x1, [z, #16*31] - adcs x0, x0, xzr - adc x1, x1, xzr - stp x0, x1, [z, #16*31] - -// Compute absolute difference [t..] 
= |x_lo - x_hi| - - ldp x0, x1, [x, #128] - ldp x16, x17, [x] - subs x0, x0, x16 - sbcs x1, x1, x17 - - ldp x2, x3, [x, #144] - ldp x16, x17, [x, #16] - sbcs x2, x2, x16 - sbcs x3, x3, x17 - - ldp x4, x5, [x, #160] - ldp x16, x17, [x, #32] - sbcs x4, x4, x16 - sbcs x5, x5, x17 - - ldp x6, x7, [x, #176] - ldp x16, x17, [x, #48] - sbcs x6, x6, x16 - sbcs x7, x7, x17 - - ldp x8, x9, [x, #192] - ldp x16, x17, [x, #64] - sbcs x8, x8, x16 - sbcs x9, x9, x17 - - ldp x10, x11, [x, #208] - ldp x16, x17, [x, #80] - sbcs x10, x10, x16 - sbcs x11, x11, x17 - - ldp x12, x13, [x, #224] - ldp x16, x17, [x, #96] - sbcs x12, x12, x16 - sbcs x13, x13, x17 - - ldp x14, x15, [x, #240] - ldp x16, x17, [x, #112] - sbcs x14, x14, x16 - sbcs x15, x15, x17 - - sbc c, xzr, xzr - - adds xzr, c, c - - eor x0, x0, c - adcs x0, x0, xzr - eor x1, x1, c - adcs x1, x1, xzr - stp x0, x1, [t] - - eor x2, x2, c - adcs x2, x2, xzr - eor x3, x3, c - adcs x3, x3, xzr - stp x2, x3, [t, #16] - - eor x4, x4, c - adcs x4, x4, xzr - eor x5, x5, c - adcs x5, x5, xzr - stp x4, x5, [t, #32] - - eor x6, x6, c - adcs x6, x6, xzr - eor x7, x7, c - adcs x7, x7, xzr - stp x6, x7, [t, #48] - - eor x8, x8, c - adcs x8, x8, xzr - eor x9, x9, c - adcs x9, x9, xzr - stp x8, x9, [t, #64] - - eor x10, x10, c - adcs x10, x10, xzr - eor x11, x11, c - adcs x11, x11, xzr - stp x10, x11, [t, #80] - - eor x12, x12, c - adcs x12, x12, xzr - eor x13, x13, c - adcs x13, x13, xzr - stp x12, x13, [t, #96] - - eor x14, x14, c - adcs x14, x14, xzr - eor x15, x15, c - adc x15, x15, xzr - stp x14, x15, [t, #112] - -// Compute M = |x_lo - x_hi|^2, size 32 - - add x0, t, #8*K - mov x1, t - add x2, t, #24*K - bl bignum_ksqr_32_64_neon_local_ksqr_16_32 - -// Add the interlocking H' and L_bot terms -// Intercept the carry at the 3k position and store it in x. -// (Note that we no longer need the input x was pointing at.) 
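Both ksqr sizes feed their middle square from an |x_lo - x_hi| operand computed with the same constant-time pattern seen in the absolute-difference blocks above: a subs/sbcs chain leaves a borrow, csetm turns the borrow into an all-ones mask, and an eor/adc pass conditionally two's-complement-negates the whole difference (the two files merely subtract in opposite orders, which does not change the absolute value). A plain-C rendering of that pattern, with a hypothetical helper name and 64-bit limbs:

#include <stdint.h>

/* d = |a - b| over n limbs, constant time. Illustration only. */
static void bignum_absdiff(uint64_t *d, const uint64_t *a, const uint64_t *b, int n) {
  uint64_t borrow = 0;
  for (int i = 0; i < n; i++) {            /* d = a - b, keep the final borrow */
    unsigned __int128 t = (unsigned __int128)a[i] - b[i] - borrow;
    d[i] = (uint64_t)t;
    borrow = (uint64_t)(t >> 64) & 1;
  }
  uint64_t mask = (uint64_t)0 - borrow;    /* csetm: all-ones iff a < b */
  unsigned __int128 carry = mask & 1;      /* the +1 of two's-complement negation */
  for (int i = 0; i < n; i++) {            /* eor/adc: d = (d ^ mask) + (a < b) */
    carry += d[i] ^ mask;
    d[i] = (uint64_t)carry;
    carry >>= 64;
  }
}

The 16->32 routine uses this shape with n = 8, the 32->64 routine with n = 16.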
- - ldp x0, x1, [z, #16*16] - ldp x2, x3, [z] - adds x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*8] - - ldp x0, x1, [z, #16*17] - ldp x2, x3, [z, #16*1] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*9] - - ldp x0, x1, [z, #16*18] - ldp x2, x3, [z, #16*2] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*10] - - ldp x0, x1, [z, #16*19] - ldp x2, x3, [z, #16*3] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*11] - - ldp x0, x1, [z, #16*20] - ldp x2, x3, [z, #16*4] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*12] - - ldp x0, x1, [z, #16*21] - ldp x2, x3, [z, #16*5] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*13] - - ldp x0, x1, [z, #16*22] - ldp x2, x3, [z, #16*6] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*14] - - ldp x0, x1, [z, #16*23] - ldp x2, x3, [z, #16*7] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*15] - - ldp x0, x1, [z, #16*16] - ldp x2, x3, [z, #16*24] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*16] - - ldp x0, x1, [z, #16*17] - ldp x2, x3, [z, #16*25] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*17] - - ldp x0, x1, [z, #16*18] - ldp x2, x3, [z, #16*26] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*18] - - ldp x0, x1, [z, #16*19] - ldp x2, x3, [z, #16*27] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*19] - - ldp x0, x1, [z, #16*20] - ldp x2, x3, [z, #16*28] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*20] - - ldp x0, x1, [z, #16*21] - ldp x2, x3, [z, #16*29] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*21] - - ldp x0, x1, [z, #16*22] - ldp x2, x3, [z, #16*30] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*22] - - ldp x0, x1, [z, #16*23] - ldp x2, x3, [z, #16*31] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*23] - - cset x, cs - -// Subtract the mid-term cross product M - - ldp x0, x1, [z, #16*L] - ldp x2, x3, [t, #16*L] - subs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*L] - - ldp x0, x1, [z, #16*9] - ldp x2, x3, [t, #16*9] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*9] - - ldp x0, x1, [z, #16*10] - ldp x2, x3, [t, #16*10] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*10] - - ldp x0, x1, [z, #16*11] - ldp x2, x3, [t, #16*11] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*11] - - ldp x0, x1, [z, #16*12] - ldp x2, x3, [t, #16*12] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*12] - - ldp x0, x1, [z, #16*13] - ldp x2, x3, [t, #16*13] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*13] - - ldp x0, x1, [z, #16*14] - ldp x2, x3, [t, #16*14] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*14] - - ldp x0, x1, [z, #16*15] - ldp x2, x3, [t, #16*15] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*15] - - ldp x0, x1, [z, #16*16] - ldp x2, x3, [t, #16*16] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*16] - - ldp x0, x1, [z, #16*17] - ldp x2, x3, [t, #16*17] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*17] - - ldp x0, x1, [z, #16*18] - ldp x2, x3, [t, #16*18] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*18] - - ldp x0, x1, [z, #16*19] - ldp x2, x3, [t, #16*19] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*19] - - ldp x0, x1, [z, #16*20] - ldp x2, x3, [t, #16*20] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*20] - - ldp x0, x1, [z, #16*21] - ldp x2, x3, [t, #16*21] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*21] - - ldp x0, x1, [z, #16*22] - ldp x2, 
x3, [t, #16*22] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*22] - - ldp x0, x1, [z, #16*23] - ldp x2, x3, [t, #16*23] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*23] - -// Get the next digits effectively resulting so far starting at 3k -// [...,c,c,c,c,x] - - sbcs x, x, xzr - csetm c, cc - -// Now propagate through the top quarter of the result - - ldp x0, x1, [z, #16*24] - adds x0, x0, x - adcs x1, x1, c - stp x0, x1, [z, #16*24] - - ldp x0, x1, [z, #16*25] - adcs x0, x0, c - adcs x1, x1, c - stp x0, x1, [z, #16*25] - - ldp x0, x1, [z, #16*26] - adcs x0, x0, c - adcs x1, x1, c - stp x0, x1, [z, #16*26] - - ldp x0, x1, [z, #16*27] - adcs x0, x0, c - adcs x1, x1, c - stp x0, x1, [z, #16*27] - - ldp x0, x1, [z, #16*28] - adcs x0, x0, c - adcs x1, x1, c - stp x0, x1, [z, #16*28] - - ldp x0, x1, [z, #16*29] - adcs x0, x0, c - adcs x1, x1, c - stp x0, x1, [z, #16*29] - - ldp x0, x1, [z, #16*30] - adcs x0, x0, c - adcs x1, x1, c - stp x0, x1, [z, #16*30] - - ldp x0, x1, [z, #16*31] - adcs x0, x0, c - adc x1, x1, c - stp x0, x1, [z, #16*31] - -// Restore - - ldp x21, x30, [sp], #16 - ldp x19, x20, [sp], #16 - - ret - -// Local copy of bignum_ksqr_16_32, identical to main one. -// This includes in turn a copy of bignum_sqr_8_16. - -bignum_ksqr_32_64_neon_local_ksqr_16_32: - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! - stp x23, x24, [sp, #-16]! - stp x25, x30, [sp, #-16]! - mov x23, x0 - mov x24, x1 - mov x25, x2 - bl bignum_ksqr_32_64_neon_local_sqr_8_16 - ldp x10, x11, [x24] - ldp x8, x9, [x24, #64] - subs x10, x10, x8 - sbcs x11, x11, x9 - ldp x12, x13, [x24, #16] - ldp x8, x9, [x24, #80] - sbcs x12, x12, x8 - sbcs x13, x13, x9 - ldp x14, x15, [x24, #32] - ldp x8, x9, [x24, #96] - sbcs x14, x14, x8 - sbcs x15, x15, x9 - ldp x16, x17, [x24, #48] - ldp x8, x9, [x24, #112] - sbcs x16, x16, x8 - sbcs x17, x17, x9 - csetm x19, cc - cmn x19, x19 - eor x10, x10, x19 - adcs x10, x10, xzr - eor x11, x11, x19 - adcs x11, x11, xzr - stp x10, x11, [x25] - eor x12, x12, x19 - adcs x12, x12, xzr - eor x13, x13, x19 - adcs x13, x13, xzr - stp x12, x13, [x25, #16] - eor x14, x14, x19 - adcs x14, x14, xzr - eor x15, x15, x19 - adcs x15, x15, xzr - stp x14, x15, [x25, #32] - eor x16, x16, x19 - adcs x16, x16, xzr - eor x17, x17, x19 - adcs x17, x17, xzr - stp x16, x17, [x25, #48] - add x0, x23, #0x80 - add x1, x24, #0x40 - bl bignum_ksqr_32_64_neon_local_sqr_8_16 - ldp x10, x11, [x23, #128] - ldp x12, x13, [x23, #64] - adds x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [x23, #128] - ldp x10, x11, [x23, #144] - ldp x12, x13, [x23, #80] - adcs x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [x23, #144] - ldp x10, x11, [x23, #160] - ldp x12, x13, [x23, #96] - adcs x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [x23, #160] - ldp x10, x11, [x23, #176] - ldp x12, x13, [x23, #112] - adcs x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [x23, #176] - ldp x10, x11, [x23, #192] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [x23, #192] - ldp x10, x11, [x23, #208] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [x23, #208] - ldp x10, x11, [x23, #224] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [x23, #224] - ldp x10, x11, [x23, #240] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [x23, #240] - add x0, x25, #0x40 - mov x1, x25 - bl bignum_ksqr_32_64_neon_local_sqr_8_16 - ldp x0, x1, [x23] - ldp x16, x17, [x23, #128] - adds x0, x0, x16 - adcs x1, x1, x17 - ldp x2, x3, [x23, #16] - ldp x16, x17, [x23, #144] - adcs x2, x2, x16 - 
adcs x3, x3, x17 - ldp x4, x5, [x23, #32] - ldp x16, x17, [x23, #160] - adcs x4, x4, x16 - adcs x5, x5, x17 - ldp x6, x7, [x23, #48] - ldp x16, x17, [x23, #176] - adcs x6, x6, x16 - adcs x7, x7, x17 - ldp x8, x9, [x23, #128] - ldp x16, x17, [x23, #192] - adcs x8, x8, x16 - adcs x9, x9, x17 - ldp x10, x11, [x23, #144] - ldp x16, x17, [x23, #208] - adcs x10, x10, x16 - adcs x11, x11, x17 - ldp x12, x13, [x23, #160] - ldp x16, x17, [x23, #224] - adcs x12, x12, x16 - adcs x13, x13, x17 - ldp x14, x15, [x23, #176] - ldp x16, x17, [x23, #240] - adcs x14, x14, x16 - adcs x15, x15, x17 - cset x24, cs - ldp x16, x17, [x25, #64] - subs x0, x0, x16 - sbcs x1, x1, x17 - stp x0, x1, [x23, #64] - ldp x16, x17, [x25, #80] - sbcs x2, x2, x16 - sbcs x3, x3, x17 - stp x2, x3, [x23, #80] - ldp x16, x17, [x25, #96] - sbcs x4, x4, x16 - sbcs x5, x5, x17 - stp x4, x5, [x23, #96] - ldp x16, x17, [x25, #112] - sbcs x6, x6, x16 - sbcs x7, x7, x17 - stp x6, x7, [x23, #112] - ldp x16, x17, [x25, #128] - sbcs x8, x8, x16 - sbcs x9, x9, x17 - stp x8, x9, [x23, #128] - ldp x16, x17, [x25, #144] - sbcs x10, x10, x16 - sbcs x11, x11, x17 - stp x10, x11, [x23, #144] - ldp x16, x17, [x25, #160] - sbcs x12, x12, x16 - sbcs x13, x13, x17 - stp x12, x13, [x23, #160] - ldp x16, x17, [x25, #176] - sbcs x14, x14, x16 - sbcs x15, x15, x17 - stp x14, x15, [x23, #176] - sbcs x24, x24, xzr - csetm x25, cc - ldp x10, x11, [x23, #192] - adds x10, x10, x24 - adcs x11, x11, x25 - stp x10, x11, [x23, #192] - ldp x10, x11, [x23, #208] - adcs x10, x10, x25 - adcs x11, x11, x25 - stp x10, x11, [x23, #208] - ldp x10, x11, [x23, #224] - adcs x10, x10, x25 - adcs x11, x11, x25 - stp x10, x11, [x23, #224] - ldp x10, x11, [x23, #240] - adcs x10, x10, x25 - adcs x11, x11, x25 - stp x10, x11, [x23, #240] - ldp x25, x30, [sp], #16 - ldp x23, x24, [sp], #16 - ldp x21, x22, [sp], #16 - ldp x19, x20, [sp], #16 - ret - -bignum_ksqr_32_64_neon_local_sqr_8_16: -// Load registers. 
- ldp x2, x3, [x1] -ldr q20, [x1] - ldp x4, x5, [x1, #16] -ldr q21, [x1, #16] - ldp x6, x7, [x1, #32] -ldr q22, [x1, #32] - ldp x8, x9, [x1, #48] -ldr q23, [x1, #48] -movi v30.2d, #0xffffffff - - mul x17, x2, x4 - mul x14, x3, x5 - -// Scalar+NEON: square the lower half with a near-clone of bignum_sqr_4_8 -// NEON: prepare 64x64->128 squaring of two 64-bit ints (x2, x3) -ext v1.16b, v20.16b, v20.16b, #8 - umulh x20, x2, x4 -shrn v2.2s, v20.2d, #32 - subs x21, x2, x3 -zip1 v0.2s, v20.2s, v1.2s - cneg x21, x21, cc // cc = lo, ul, last -umull v5.2d, v2.2s, v2.2s - csetm x11, cc // cc = lo, ul, last -umull v6.2d, v2.2s, v0.2s - subs x12, x5, x4 -umull v3.2d, v0.2s, v0.2s - cneg x12, x12, cc // cc = lo, ul, last -mov v1.16b, v6.16b - mul x13, x21, x12 -usra v1.2d, v3.2d, #32 - umulh x12, x21, x12 -and v4.16b, v1.16b, v30.16b - cinv x11, x11, cc // cc = lo, ul, last -add v4.2d, v4.2d, v6.2d - eor x13, x13, x11 -usra v5.2d, v4.2d, #32 - eor x12, x12, x11 -sli v3.2d, v4.2d, #32 - adds x19, x17, x20 -usra v5.2d, v1.2d, #32 - adc x20, x20, xzr - // NEON: prepare 64x64->128 squaring of two 64-bit ints (x4, x5) - ext v1.16b, v21.16b, v21.16b, #8 - umulh x21, x3, x5 - shrn v2.2s, v21.2d, #32 - adds x19, x19, x14 - zip1 v0.2s, v21.2s, v1.2s - adcs x20, x20, x21 - adc x21, x21, xzr - adds x20, x20, x14 - adc x21, x21, xzr - cmn x11, #0x1 - adcs x19, x19, x13 -mov x13, v3.d[1] // mul x13, x3, x3 - adcs x20, x20, x12 -mov x14, v5.d[1] // umulh x14, x3, x3 - adc x21, x21, x11 -mov x12, v3.d[0] // mul x12, x2, x2 - adds x17, x17, x17 -mov x11, v5.d[0] // umulh x11, x2, x2 - adcs x19, x19, x19 - umull v5.2d, v2.2s, v2.2s - adcs x20, x20, x20 - umull v6.2d, v2.2s, v0.2s - adcs x21, x21, x21 - umull v3.2d, v0.2s, v0.2s - adc x10, xzr, xzr - mov v1.16b, v6.16b - - mul x15, x2, x3 - usra v1.2d, v3.2d, #32 - umulh x16, x2, x3 - and v4.16b, v1.16b, v30.16b - adds x11, x11, x15 - add v4.2d, v4.2d, v6.2d - adcs x13, x13, x16 - usra v5.2d, v4.2d, #32 - adc x14, x14, xzr - sli v3.2d, v4.2d, #32 - adds x11, x11, x15 - usra v5.2d, v1.2d, #32 - adcs x13, x13, x16 - adc x14, x14, xzr - stp x12, x11, [x0] - mov x11, v5.d[0] // umulh x11, x4, x4 - adds x17, x17, x13 - mov x13, v3.d[1] // mul x13, x5, x5 - adcs x19, x19, x14 - mov x14, v5.d[1] // umulh x14, x5, x5 - adcs x20, x20, xzr - mov x12, v3.d[0] // mul x12, x4, x4 - adcs x21, x21, xzr -// NEON: prepare muls in the upper half -ext v1.16b, v22.16b, v22.16b, #8 - adc x10, x10, xzr -shrn v2.2s, v22.2d, #32 - stp x17, x19, [x0, #16] -zip1 v0.2s, v22.2s, v1.2s - mul x15, x4, x5 -umull v5.2d, v2.2s, v2.2s - umulh x16, x4, x5 -umull v6.2d, v2.2s, v0.2s - adds x11, x11, x15 -umull v3.2d, v0.2s, v0.2s - adcs x13, x13, x16 -mov v1.16b, v6.16b - adc x14, x14, xzr -usra v1.2d, v3.2d, #32 - adds x11, x11, x15 -and v4.16b, v1.16b, v30.16b - adcs x13, x13, x16 -add v4.2d, v4.2d, v6.2d - adc x14, x14, xzr -usra v5.2d, v4.2d, #32 - adds x12, x12, x20 -sli v3.2d, v4.2d, #32 - adcs x11, x11, x21 -usra v5.2d, v1.2d, #32 - stp x12, x11, [x0, #32] - // NEON: prepare muls in the upper half - ext v1.16b, v23.16b, v23.16b, #8 - adcs x13, x13, x10 - shrn v2.2s, v23.2d, #32 - adc x14, x14, xzr - zip1 v0.2s, v23.2s, v1.2s - stp x13, x14, [x0, #48] - -// Scalar: square the upper half with a slight variant of the previous block - mul x17, x6, x8 - umull v16.2d, v2.2s, v2.2s - mul x14, x7, x9 - umull v6.2d, v2.2s, v0.2s - umulh x20, x6, x8 - umull v18.2d, v0.2s, v0.2s - subs x21, x6, x7 - cneg x21, x21, cc // cc = lo, ul, last - mov v1.16b, v6.16b - csetm x11, cc // cc = lo, ul, last - subs x12, x9, 
x8 - cneg x12, x12, cc // cc = lo, ul, last - usra v1.2d, v18.2d, #32 - mul x13, x21, x12 - and v4.16b, v1.16b, v30.16b - umulh x12, x21, x12 - add v4.2d, v4.2d, v6.2d - cinv x11, x11, cc // cc = lo, ul, last - eor x13, x13, x11 - eor x12, x12, x11 - usra v16.2d, v4.2d, #32 - adds x19, x17, x20 - adc x20, x20, xzr - sli v18.2d, v4.2d, #32 - umulh x21, x7, x9 - adds x19, x19, x14 - adcs x20, x20, x21 - adc x21, x21, xzr - adds x20, x20, x14 -mov x14, v5.d[1] - adc x21, x21, xzr - cmn x11, #0x1 - adcs x19, x19, x13 -mov x13, v3.d[1] - adcs x20, x20, x12 -mov x12, v3.d[0] - adc x21, x21, x11 -mov x11, v5.d[0] - adds x17, x17, x17 - adcs x19, x19, x19 - usra v16.2d, v1.2d, #32 - adcs x20, x20, x20 - adcs x21, x21, x21 - adc x10, xzr, xzr -// NEON: two mul+umulhs for the next stage -uzp2 v17.4s, v21.4s, v23.4s - mul x15, x6, x7 -xtn v4.2s, v23.2d - umulh x16, x6, x7 - mov x22, v16.d[0] - adds x11, x11, x15 - adcs x13, x13, x16 -xtn v5.2s, v21.2d - adc x14, x14, xzr - adds x11, x11, x15 -rev64 v1.4s, v21.4s - adcs x13, x13, x16 - adc x14, x14, xzr - stp x12, x11, [x0, #64] - adds x17, x17, x13 - mov x13, v18.d[1] - adcs x19, x19, x14 - mov x14, v16.d[1] - adcs x20, x20, xzr - mov x12, v18.d[0] - adcs x21, x21, xzr - adc x10, x10, xzr -umull v6.2d, v4.2s, v5.2s - stp x17, x19, [x0, #80] -umull v7.2d, v4.2s, v17.2s - mul x15, x8, x9 -uzp2 v16.4s, v23.4s, v23.4s - umulh x16, x8, x9 -mul v0.4s, v1.4s, v23.4s - adds x11, x22, x15 - adcs x13, x13, x16 -usra v7.2d, v6.2d, #32 - adc x14, x14, xzr - adds x11, x11, x15 -umull v1.2d, v16.2s, v17.2s - adcs x13, x13, x16 - adc x14, x14, xzr -uaddlp v0.2d, v0.4s - adds x12, x12, x20 - adcs x11, x11, x21 -and v2.16b, v7.16b, v30.16b -umlal v2.2d, v16.2s, v5.2s -shl v0.2d, v0.2d, #32 -usra v1.2d, v7.2d, #32 -umlal v0.2d, v4.2s, v5.2s -mov x16, v0.d[1] -mov x15, v0.d[0] -usra v1.2d, v2.2d, #32 -mov x20, v1.d[0] -mov x21, v1.d[1] - stp x12, x11, [x0, #96] - adcs x13, x13, x10 - adc x14, x14, xzr - stp x13, x14, [x0, #112] - -// Now get the cross-product in [s7,...,s0] and double it as [c,s7,...,s0] - - mul x10, x2, x6 - mul x14, x3, x7 - umulh x17, x2, x6 - adds x14, x14, x17 - umulh x17, x3, x7 - adcs x15, x15, x17 - adcs x16, x16, x20 - adc x17, x21, xzr - adds x11, x14, x10 - adcs x14, x15, x14 - adcs x15, x16, x15 - adcs x16, x17, x16 - adc x17, xzr, x17 - adds x12, x14, x10 - adcs x13, x15, x11 - adcs x14, x16, x14 - adcs x15, x17, x15 - adcs x16, xzr, x16 - adc x17, xzr, x17 - subs x22, x4, x5 - cneg x22, x22, cc // cc = lo, ul, last - csetm x19, cc // cc = lo, ul, last - subs x20, x9, x8 - cneg x20, x20, cc // cc = lo, ul, last - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc // cc = lo, ul, last - cmn x19, #0x1 - eor x21, x21, x19 - adcs x15, x15, x21 - eor x20, x20, x19 - adcs x16, x16, x20 - adc x17, x17, x19 - subs x22, x2, x3 - cneg x22, x22, cc // cc = lo, ul, last - csetm x19, cc // cc = lo, ul, last - subs x20, x7, x6 - cneg x20, x20, cc // cc = lo, ul, last - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc // cc = lo, ul, last - cmn x19, #0x1 - eor x21, x21, x19 - adcs x11, x11, x21 - eor x20, x20, x19 - adcs x12, x12, x20 - adcs x13, x13, x19 - adcs x14, x14, x19 - adcs x15, x15, x19 - adcs x16, x16, x19 - adc x17, x17, x19 - subs x22, x3, x5 - cneg x22, x22, cc // cc = lo, ul, last - csetm x19, cc // cc = lo, ul, last - subs x20, x9, x7 - cneg x20, x20, cc // cc = lo, ul, last - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc // cc = lo, ul, last - cmn x19, #0x1 - eor x21, x21, x19 - adcs x14, x14, x21 - eor 
x20, x20, x19 - adcs x15, x15, x20 - adcs x16, x16, x19 - adc x17, x17, x19 - subs x22, x2, x4 - cneg x22, x22, cc // cc = lo, ul, last - csetm x19, cc // cc = lo, ul, last - subs x20, x8, x6 - cneg x20, x20, cc // cc = lo, ul, last - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc // cc = lo, ul, last - cmn x19, #0x1 - eor x21, x21, x19 - adcs x12, x12, x21 - eor x20, x20, x19 - adcs x13, x13, x20 - adcs x14, x14, x19 - adcs x15, x15, x19 - adcs x16, x16, x19 - adc x17, x17, x19 - subs x22, x2, x5 - cneg x22, x22, cc // cc = lo, ul, last - csetm x19, cc // cc = lo, ul, last - subs x20, x9, x6 - cneg x20, x20, cc // cc = lo, ul, last - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc // cc = lo, ul, last - cmn x19, #0x1 - eor x21, x21, x19 - adcs x13, x13, x21 - eor x20, x20, x19 - adcs x14, x14, x20 - adcs x15, x15, x19 - adcs x16, x16, x19 - adc x17, x17, x19 - subs x22, x3, x4 - cneg x22, x22, cc // cc = lo, ul, last - csetm x19, cc // cc = lo, ul, last - subs x20, x8, x7 - cneg x20, x20, cc // cc = lo, ul, last - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc // cc = lo, ul, last - cmn x19, #0x1 - eor x21, x21, x19 - adcs x13, x13, x21 - eor x20, x20, x19 - adcs x14, x14, x20 - adcs x15, x15, x19 - adcs x16, x16, x19 - adc x17, x17, x19 - adds x10, x10, x10 - adcs x11, x11, x11 - adcs x12, x12, x12 - adcs x13, x13, x13 - adcs x14, x14, x14 - adcs x15, x15, x15 - adcs x16, x16, x16 - adcs x17, x17, x17 - adc x19, xzr, xzr - -// Add it back to the buffer - - ldp x2, x3, [x0, #32] - adds x10, x10, x2 - adcs x11, x11, x3 - stp x10, x11, [x0, #32] - - ldp x2, x3, [x0, #48] - adcs x12, x12, x2 - adcs x13, x13, x3 - stp x12, x13, [x0, #48] - - ldp x2, x3, [x0, #64] - adcs x14, x14, x2 - adcs x15, x15, x3 - stp x14, x15, [x0, #64] - - ldp x2, x3, [x0, #80] - adcs x16, x16, x2 - adcs x17, x17, x3 - stp x16, x17, [x0, #80] - - ldp x2, x3, [x0, #96] - adcs x2, x2, x19 - adcs x3, x3, xzr - stp x2, x3, [x0, #96] - - ldp x2, x3, [x0, #112] - adcs x2, x2, xzr - adc x3, x3, xzr - stp x2, x3, [x0, #112] - - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif diff --git a/third_party/s2n-bignum/arm/p256/bignum_montinv_p256.S b/third_party/s2n-bignum/arm/p256/bignum_montinv_p256.S deleted file mode 100644 index 1a5a7a0ffc4..00000000000 --- a/third_party/s2n-bignum/arm/p256/bignum_montinv_p256.S +++ /dev/null @@ -1,1303 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Montgomery inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1 -// Input x[4]; output z[4] -// -// extern void bignum_montinv_p256(uint64_t z[static 4],uint64_t x[static 4]); -// -// If the 4-digit input x is coprime to p_256, i.e. is not divisible -// by it, returns z < p_256 such that x * z == 2^512 (mod p_256). This -// is effectively "Montgomery inverse" because if we consider x and z as -// Montgomery forms of X and Z, i.e. x == 2^256 * X and z == 2^256 * Z -// (both mod p_256) then X * Z == 1 (mod p_256). That is, this function -// gives the analog of the modular inverse bignum_inv_p256 but with both -// input and output in the Montgomery domain. Note that x does not need -// to be reduced modulo p_256, but the output always is. If the input -// is divisible (i.e. is 0 or p_256), then there can be no solution to -// the congruence x * z == 2^512 (mod p_256), and z = 0 is returned. 
-// -// Standard ARM ABI: X0 = z, X1 = x -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montinv_p256) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montinv_p256) - - .text - .balign 4 - -// Size in bytes of a 64-bit word - -#define N 8 - -// Used for the return pointer - -#define res x20 - -// Loop counter and d = 2 * delta value for divstep - -#define i x21 -#define d x22 - -// Registers used for matrix element magnitudes and signs - -#define m00 x10 -#define m01 x11 -#define m10 x12 -#define m11 x13 -#define s00 x14 -#define s01 x15 -#define s10 x16 -#define s11 x17 - -// Initial carries for combinations - -#define car0 x9 -#define car1 x19 - -// Input and output, plain registers treated according to pattern - -#define reg0 x0, #0 -#define reg1 x1, #0 -#define reg2 x2, #0 -#define reg3 x3, #0 -#define reg4 x4, #0 - -#define x x1, #0 -#define z x0, #0 - -// Pointer-offset pairs for temporaries on stack - -#define f sp, #0 -#define g sp, #(6*N) -#define u sp, #(12*N) -#define v sp, #(16*N) - -// Total size to reserve on the stack - -#define NSPACE #(20*N) - -// Loading large constants - -#define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ - movk nn, n3, lsl #48 - -// --------------------------------------------------------------------------- -// Core signed almost-Montgomery reduction macro. Takes input in -// [d4;d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to -// the existing [d4;d3;d2;d1], and re-using d0 as a temporary internally -// as well as t0, t1, t2. This is almost-Montgomery, i.e. the result fits -// in 4 digits but is not necessarily strictly reduced mod p_256. -// --------------------------------------------------------------------------- - -#define amontred(d4,d3,d2,d1,d0, t2,t1,t0) \ -/* We only know the input is -2^316 < x < 2^316. To do traditional */ \ -/* unsigned Montgomery reduction, start by adding 2^61 * p_256. */ \ - mov t0, #0xe000000000000000; \ - adds d0, d0, t0; \ - sbcs d1, d1, xzr; \ - mov t1, #0x000000001fffffff; \ - adcs d2, d2, t1; \ - mov t2, #0x2000000000000000; \ - adcs d3, d3, t2; \ - mov t0, #0x1fffffffe0000000; \ - adc d4, d4, t0; \ -/* Let w = d0, the original word we use as offset; d0 gets recycled */ \ -/* First let [t2;t1] = 2^32 * w */ \ -/* then let [d0;t0] = (2^64 - 2^32 + 1) * w (overwrite old d0) */ \ - lsl t1, d0, #32; \ - subs t0, d0, t1; \ - lsr t2, d0, #32; \ - sbc d0, d0, t2; \ -/* Hence basic [d4;d3;d2;d1] += (2^256 - 2^224 + 2^192 + 2^96) * w */ \ - adds d1, d1, t1; \ - adcs d2, d2, t2; \ - adcs d3, d3, t0; \ - adcs d4, d4, d0; \ -/* Now capture top carry and subtract p_256 if set (almost-Montgomery) */ \ - mov t0, #0xffffffffffffffff; \ - mov t1, #0x00000000ffffffff; \ - mov t2, #0xffffffff00000001; \ - csel t0, t0, xzr, cs; \ - csel t1, t1, xzr, cs; \ - csel t2, t2, xzr, cs; \ - subs d1, d1, t0; \ - sbcs d2, d2, t1; \ - sbcs d3, d3, xzr; \ - sbc d4, d4, t2 - -// Very similar to a subroutine call to the s2n-bignum word_divstep59. 
-// But different in register usage and returning the final matrix in -// registers as follows -// -// [ m00 m01] -// [ m10 m11] - -#define divstep59() \ - and x4, x2, #0xfffff; \ - orr x4, x4, #0xfffffe0000000000; \ - and x5, x3, #0xfffff; \ - orr x5, x5, #0xc000000000000000; \ - tst x5, #0x1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, 
xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - asr x5, x5, #1; \ - add x8, x4, #0x100, lsl #12; \ - sbfx x8, x8, #21, #21; \ - mov x11, #0x100000; \ - add x11, x11, x11, lsl #21; \ - add x9, x4, x11; \ - asr x9, x9, #42; \ - add x10, x5, #0x100, lsl #12; \ - sbfx x10, x10, #21, #21; \ - add x11, x5, x11; \ - asr x11, x11, #42; \ - mul x6, x8, x2; \ - mul x7, x9, x3; \ - mul x2, x10, x2; \ - mul x3, x11, x3; \ - add x4, x6, x7; \ - add x5, x2, x3; \ - asr x2, x4, #20; \ - asr x3, x5, #20; \ - and x4, x2, #0xfffff; \ - orr x4, x4, #0xfffffe0000000000; \ - and x5, x3, #0xfffff; \ - orr x5, x5, #0xc000000000000000; \ - tst x5, #0x1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, 
ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - asr x5, x5, #1; \ - add x12, x4, #0x100, lsl #12; \ - sbfx x12, x12, #21, #21; \ - mov x15, #0x100000; \ - add x15, x15, x15, lsl #21; \ - add x13, x4, x15; \ - asr x13, x13, #42; \ - add x14, x5, #0x100, lsl #12; \ - sbfx x14, x14, #21, #21; \ - add x15, x5, x15; \ - asr x15, x15, #42; \ - mul x6, x12, x2; \ - mul x7, x13, x3; \ - mul x2, x14, x2; \ - mul x3, x15, x3; \ - add x4, x6, x7; \ - add x5, x2, x3; \ - asr x2, x4, #20; \ - asr x3, x5, #20; \ - and x4, x2, #0xfffff; \ - orr x4, x4, #0xfffffe0000000000; \ - and x5, x3, #0xfffff; \ - orr x5, x5, #0xc000000000000000; \ - tst x5, #0x1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, 
#1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - mul x2, x12, x8; \ - mul x3, x12, x9; \ - mul x6, x14, x8; \ - mul x7, x14, x9; \ - madd x8, x13, x10, x2; \ - madd x9, x13, x11, x3; \ - madd x16, x15, x10, x6; \ - madd x17, x15, x11, x7; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - asr x5, x5, #1; \ - add x12, x4, #0x100, lsl #12; \ - sbfx x12, x12, #22, #21; \ - mov x15, #0x100000; \ - add x15, x15, x15, lsl #21; \ - add x13, x4, x15; \ - asr x13, x13, #43; \ - add x14, x5, #0x100, lsl #12; \ - sbfx x14, x14, #22, #21; \ - add x15, x5, x15; \ - asr x15, x15, #43; \ - mneg x2, x12, x8; \ - mneg x3, x12, x9; \ - mneg x4, x14, x8; \ - mneg x5, x14, x9; \ - msub m00, x13, x16, x2; \ - msub m01, x13, x17, x3; \ - msub m10, x15, x16, x4; \ - msub m11, x15, x17, x5 - -S2N_BN_SYMBOL(bignum_montinv_p256): - -// Save registers and make room for temporaries - - stp x19, x20, [sp, -16]! - stp x21, x22, [sp, -16]! - stp x23, x24, [sp, -16]! 
- sub sp, sp, NSPACE - -// Save the return pointer for the end so we can overwrite x0 later - - mov res, x0 - -// Copy the prime and input into the main f and g variables respectively. -// Make sure x is reduced so that g <= f as assumed in the bound proof. - - mov x10, #0xffffffffffffffff - mov x11, #0x00000000ffffffff - mov x13, #0xffffffff00000001 - stp x10, x11, [f] - stp xzr, x13, [f+2*N] - str xzr, [f+4*N] - - ldp x2, x3, [x1] - subs x10, x2, x10 - sbcs x11, x3, x11 - ldp x4, x5, [x1, #(2*N)] - sbcs x12, x4, xzr - sbcs x13, x5, x13 - - csel x2, x2, x10, cc - csel x3, x3, x11, cc - csel x4, x4, x12, cc - csel x5, x5, x13, cc - - stp x2, x3, [g] - stp x4, x5, [g+2*N] - str xzr, [g+4*N] - -// Also maintain reduced < 2^256 vector [u,v] such that -// [f,g] == x * 2^{5*i-562} * [u,v] (mod p_256) -// starting with [p_256,x] == x * 2^{5*0-562} * [0,2^562] (mod p_256) -// The weird-looking 5*i modifications come in because we are doing -// 64-bit word-sized Montgomery reductions at each stage, which is -// 5 bits more than the 59-bit requirement to keep things stable. -// After the 10th and last iteration and sign adjustment, when -// f == 1 for in-scope cases, we have x * 2^{50-562} * u == 1, i.e. -// x * u == 2^512 as required. - - stp xzr, xzr, [u] - stp xzr, xzr, [u+2*N] - -// The starting constant 2^562 mod p_256 is -// 0x000bffffffebffff:fffbffffffefffff:ffe8000000000000:000c000000140000 -// where colons separate 64-bit subwords, least significant at the right. -// Only word number 1, value 0xffe8000000000000, is a single ARM move. - - mov x10, #0x0000000000140000 - orr x10, x10, #0x000c000000000000 - - mov x11, #0xffe8000000000000 - - movbig(x13, #0x000b, #0xffff, #0xffef, #0xffff) - orr x12, x13, #0xfff0000000000000 - and x13, x13, #0xfffffffffffbffff - - stp x10, x11, [v] - stp x12, x13, [v+2*N] - -// Start of main loop. We jump into the middle so that the divstep -// portion is common to the special tenth iteration after a uniform -// first 9. - - mov i, #10 - mov d, #1 - b bignum_montinv_p256_midloop - -bignum_montinv_p256_loop: - -// Separate the matrix elements into sign-magnitude pairs - - cmp m00, xzr - csetm s00, mi - cneg m00, m00, mi - - cmp m01, xzr - csetm s01, mi - cneg m01, m01, mi - - cmp m10, xzr - csetm s10, mi - cneg m10, m10, mi - - cmp m11, xzr - csetm s11, mi - cneg m11, m11, mi - -// Adjust the initial values to allow for complement instead of negation -// This initial offset is the same for [f,g] and [u,v] compositions. -// Save it in stable registers for the [u,v] part and do [f,g] first. - - and x0, m00, s00 - and x1, m01, s01 - add car0, x0, x1 - - and x0, m10, s10 - and x1, m11, s11 - add car1, x0, x1 - -// Now the computation of the updated f and g values. This maintains a -// 2-word carry between stages so we can conveniently insert the shift -// right by 59 before storing back, and not overwrite digits we need -// again of the old f and g values. 
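The per-digit accumulation below leans on a complement-instead-of-negation identity, which is exactly what the car0/car1 offsets above prepare: for a signed matrix entry m with all-ones sign mask s and magnitude |m|, m*x == |m|*(x ^ s) + (|m| & s) (mod 2^64), so each signed multiply-accumulate becomes an unsigned eor/mul/umulh chain plus a one-off per-row offset. A tiny standalone check of the single-word identity (illustration only):

#include <assert.h>
#include <stdint.h>

int main(void) {
  int64_t m = -0x1234567890ABCDEFLL;        /* a signed matrix entry             */
  uint64_t x = 0xDEADBEEFCAFEF00DULL;       /* one digit of f or g               */
  uint64_t s = (m < 0) ? ~0ULL : 0;         /* csetm: all-ones iff m < 0         */
  uint64_t mag = ((uint64_t)m ^ s) - s;     /* cneg: |m|                         */
  uint64_t lhs = (uint64_t)m * x;           /* signed product, reduced mod 2^64  */
  uint64_t rhs = mag * (x ^ s) + (mag & s); /* eor + mul + the car-style offset  */
  assert(lhs == rhs);
  return 0;
}

In the digit loops the identity is applied word by word, with umulh supplying the high halves and the two (|m| & s) offsets folded in once as car0 and car1.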
-// -// Digit 0 of [f,g] - - ldr x7, [f] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, car0, x0 - adc x2, xzr, x1 - ldr x8, [g] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - adc x2, x2, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, car1, x0 - adc x3, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - adc x3, x3, x1 - -// Digit 1 of [f,g] - - ldr x7, [f+N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [g+N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - adc x6, x6, x1 - extr x4, x2, x4, #59 - str x4, [f] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x3, x3, x0 - adc x4, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x3, x3, x0 - adc x4, x4, x1 - extr x5, x3, x5, #59 - str x5, [g] - -// Digit 2 of [f,g] - - ldr x7, [f+2*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [g+2*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - adc x5, x5, x1 - extr x2, x6, x2, #59 - str x2, [f+N] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x4, x4, x0 - adc x2, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x4, x4, x0 - adc x2, x2, x1 - extr x3, x4, x3, #59 - str x3, [g+N] - -// Digits 3 and 4 of [f,g] - - ldr x7, [f+3*N] - eor x1, x7, s00 - ldr x23, [f+4*N] - eor x3, x23, s00 - and x3, x3, m00 - neg x3, x3 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, x3, x1 - ldr x8, [g+3*N] - eor x1, x8, s01 - ldr x24, [g+4*N] - eor x0, x24, s01 - and x0, x0, m01 - sub x3, x3, x0 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - adc x3, x3, x1 - extr x6, x5, x6, #59 - str x6, [f+2*N] - extr x5, x3, x5, #59 - str x5, [f+3*N] - asr x3, x3, #59 - str x3, [f+4*N] - - eor x1, x7, s10 - eor x5, x23, s10 - and x5, x5, m10 - neg x5, x5 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x2, x2, x0 - adc x5, x5, x1 - eor x1, x8, s11 - eor x0, x24, s11 - and x0, x0, m11 - sub x5, x5, x0 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x2, x2, x0 - adc x5, x5, x1 - extr x4, x2, x4, #59 - str x4, [g+2*N] - extr x2, x5, x2, #59 - str x2, [g+3*N] - asr x5, x5, #59 - str x5, [g+4*N] - -// Now the computation of the updated u and v values and their -// Montgomery reductions. A very similar accumulation except that -// the top words of u and v are unsigned and we don't shift. 
-// -// Digit 0 of [u,v] - - ldr x7, [u] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, car0, x0 - adc x2, xzr, x1 - ldr x8, [v] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u] - adc x2, x2, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, car1, x0 - adc x3, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - str x5, [v] - adc x3, x3, x1 - -// Digit 1 of [u,v] - - ldr x7, [u+N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [v+N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - str x2, [u+N] - adc x6, x6, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x3, x3, x0 - adc x4, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x3, x3, x0 - str x3, [v+N] - adc x4, x4, x1 - -// Digit 2 of [u,v] - - ldr x7, [u+2*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [v+2*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - str x6, [u+2*N] - adc x5, x5, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x4, x4, x0 - adc x2, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x4, x4, x0 - str x4, [v+2*N] - adc x2, x2, x1 - -// Digits 3 and 4 of u (top is unsigned) - - ldr x7, [u+3*N] - eor x1, x7, s00 - and x3, s00, m00 - neg x3, x3 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, x3, x1 - ldr x8, [v+3*N] - eor x1, x8, s01 - and x0, s01, m01 - sub x3, x3, x0 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - adc x3, x3, x1 - -// Montgomery reduction of u - - ldp x0, x1, [u] - ldr x6, [u+2*N] - amontred(x3,x5,x6,x1,x0, x10,x11,x14) - stp x1, x6, [u] - stp x5, x3, [u+16] - -// Digits 3 and 4 of v (top is unsigned) - - eor x1, x7, s10 - and x5, s10, m10 - neg x5, x5 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x2, x2, x0 - adc x5, x5, x1 - eor x1, x8, s11 - and x0, s11, m11 - sub x5, x5, x0 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x2, x2, x0 - adc x5, x5, x1 - -// Montgomery reduction of v - - ldp x0, x1, [v] - ldr x3, [v+2*N] - amontred(x5,x2,x3,x1,x0, x10,x11,x14) - stp x1, x3, [v] - stp x2, x5, [v+16] - -bignum_montinv_p256_midloop: - - mov x1, d - ldr x2, [f] - ldr x3, [g] - divstep59() - mov d, x1 - -// Next iteration - - subs i, i, #1 - bne bignum_montinv_p256_loop - -// The 10th and last iteration does not need anything except the -// u value and the sign of f; the latter can be obtained from the -// lowest word of f. So it's done differently from the main loop. -// Find the sign of the new f. For this we just need one digit -// since we know (for in-scope cases) that f is either +1 or -1. -// We don't explicitly shift right by 59 either, but looking at -// bit 63 (or any bit >= 60) of the unshifted result is enough -// to distinguish -1 from +1; this is then made into a mask. - - ldr x0, [f] - ldr x1, [g] - mul x0, x0, m00 - madd x1, x1, m01, x0 - asr x0, x1, #63 - -// Now separate out the matrix into sign-magnitude pairs -// and adjust each one based on the sign of f. -// -// Note that at this point we expect |f|=1 and we got its -// sign above, so then since [f,0] == x * 2^{-512} [u,v] (mod p_256) -// we want to flip the sign of u according to that of f. 
- - cmp m00, xzr - csetm s00, mi - cneg m00, m00, mi - eor s00, s00, x0 - - cmp m01, xzr - csetm s01, mi - cneg m01, m01, mi - eor s01, s01, x0 - - cmp m10, xzr - csetm s10, mi - cneg m10, m10, mi - eor s10, s10, x0 - - cmp m11, xzr - csetm s11, mi - cneg m11, m11, mi - eor s11, s11, x0 - -// Adjust the initial value to allow for complement instead of negation - - and x0, m00, s00 - and x1, m01, s01 - add car0, x0, x1 - -// Digit 0 of [u] - - ldr x7, [u] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, car0, x0 - adc x2, xzr, x1 - ldr x8, [v] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u] - adc x2, x2, x1 - -// Digit 1 of [u] - - ldr x7, [u+N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [v+N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - str x2, [u+N] - adc x6, x6, x1 - -// Digit 2 of [u] - - ldr x7, [u+2*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [v+2*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - str x6, [u+2*N] - adc x5, x5, x1 - -// Digits 3 and 4 of u (top is unsigned) - - ldr x7, [u+3*N] - eor x1, x7, s00 - and x3, s00, m00 - neg x3, x3 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, x3, x1 - ldr x8, [v+3*N] - eor x1, x8, s01 - and x0, s01, m01 - sub x3, x3, x0 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - adc x3, x3, x1 - -// Montgomery reduction of u. This needs to be strict not "almost" -// so it is followed by an optional subtraction of p_256 - - ldp x0, x1, [u] - ldr x2, [u+2*N] - amontred(x3,x5,x2,x1,x0, x10,x11,x14) - - mov x10, #0xffffffffffffffff - subs x10, x1, x10 - mov x11, #0x00000000ffffffff - sbcs x11, x2, x11 - mov x13, #0xffffffff00000001 - sbcs x12, x5, xzr - sbcs x13, x3, x13 - - csel x10, x1, x10, cc - csel x11, x2, x11, cc - csel x12, x5, x12, cc - csel x13, x3, x13, cc - -// Store it back to the final output - - stp x10, x11, [res] - stp x12, x13, [res, #16] - -// Restore stack and registers - - add sp, sp, NSPACE - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits -#endif diff --git a/third_party/s2n-bignum/arm/p384/bignum_inv_p384.S b/third_party/s2n-bignum/arm/p384/bignum_inv_p384.S deleted file mode 100644 index 085224172ea..00000000000 --- a/third_party/s2n-bignum/arm/p384/bignum_inv_p384.S +++ /dev/null @@ -1,1469 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Modular inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1 -// Input x[6]; output z[6] -// -// extern void bignum_inv_p384(uint64_t z[static 6],uint64_t x[static 6]); -// -// If the 6-digit input x is coprime to p_384, i.e. is not divisible -// by it, returns z < p_384 such that x * z == 1 (mod p_384). Note that -// x does not need to be reduced modulo p_384, but the output always is. -// If the input is divisible (i.e. is 0 or p_384), then there can be no -// modular inverse and z = 0 is returned. 
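
The rest of the file is driven by an unrolled divstep iteration (the loop counter i, the d = 2 * delta value, and the divstep59() macro defined further down), just like the p256 routine above and the Montgomery p384 variant below. A plain C sketch of the single-step recurrence underlying it (a textbook Bernstein-Yang-style divstep, restated here only for orientation; the deleted code batches 59 steps on 20-bit truncated values and also accumulates the 2x2 transition matrix m00..m11, which this sketch omits):

    #include <assert.h>
    #include <stdint.h>

    /* One divstep; requires f odd on entry and keeps it odd. */
    static void divstep(int64_t *delta, int64_t *f, int64_t *g) {
      if (*delta > 0 && (*g & 1)) {
        int64_t newg = (*g - *f) / 2;   /* f, g both odd, so this is exact */
        *delta = 1 - *delta;
        *f = *g;
        *g = newg;
      } else {
        *delta = 1 + *delta;
        *g = (*g + ((*g & 1) ? *f : 0)) / 2;   /* also an exact halving */
      }
    }

    static int64_t gcd64(int64_t a, int64_t b) {
      while (b) { int64_t t = a % b; a = b; b = t; }
      return a < 0 ? -a : a;
    }

    int main(void) {
      /* Iterating divstep from (1, f0, g0) with f0 odd drives g to 0 and
         f to +/- gcd(f0, g0); the inverse routines additionally track how
         the iterations transform the inputs. */
      int64_t delta = 1, f = 1000003, g = 123456;   /* arbitrary small stand-ins */
      for (int i = 0; i < 200; i++) divstep(&delta, &f, &g);
      assert(g == 0);
      assert((f < 0 ? -f : f) == gcd64(1000003, 123456));
      return 0;
    }
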
-// -// Standard ARM ABI: X0 = z, X1 = x -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_inv_p384) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_inv_p384) - - .text - .balign 4 - -// Size in bytes of a 64-bit word - -#define N 8 - -// Used for the return pointer - -#define res x20 - -// Loop counter and d = 2 * delta value for divstep - -#define i x21 -#define d x22 - -// Registers used for matrix element magnitudes and signs - -#define m00 x10 -#define m01 x11 -#define m10 x12 -#define m11 x13 -#define s00 x14 -#define s01 x15 -#define s10 x16 -#define s11 x17 - -// Initial carries for combinations - -#define car0 x9 -#define car1 x19 - -// Input and output, plain registers treated according to pattern - -#define reg0 x0, #0 -#define reg1 x1, #0 -#define reg2 x2, #0 -#define reg3 x3, #0 -#define reg4 x4, #0 - -#define x x1, #0 -#define z x0, #0 - -// Pointer-offset pairs for temporaries on stack -// The u and v variables are 6 words each as expected, but the f and g -// variables are 8 words each -- they need to have at least one extra -// word for a sign word, and to preserve alignment we "round up" to 8. -// In fact, we currently keep an extra word in u and v as well. - -#define f sp, #0 -#define g sp, #(8*N) -#define u sp, #(16*N) -#define v sp, #(24*N) - -// Total size to reserve on the stack - -#define NSPACE #(32*N) - -// --------------------------------------------------------------------------- -// Core signed almost-Montgomery reduction macro. Takes input in -// [d6;d5;d4;d3;d2;d1;d0] and returns result in [d6;d5d4;d3;d2;d1], adding -// to the existing [d6;d5;d4;d3;d2;d1], and re-using d0 as a temporary -// internally as well as t0, t1, t2. This is almost-Montgomery, i.e. the -// result fits in 6 digits but is not necessarily strictly reduced mod p_384. -// --------------------------------------------------------------------------- - -#define amontred(d6,d5,d4,d3,d2,d1,d0, t3,t2,t1) \ -/* We only know the input is -2^444 < x < 2^444. To do traditional */ \ -/* unsigned Montgomery reduction, start by adding 2^61 * p_384. */ \ - mov t1, #0xe000000000000000; \ - adds d0, d0, t1; \ - mov t2, #0x000000001fffffff; \ - adcs d1, d1, t2; \ - mov t3, #0xffffffffe0000000; \ - bic t3, t3, #0x2000000000000000; \ - adcs d2, d2, t3; \ - sbcs d3, d3, xzr; \ - sbcs d4, d4, xzr; \ - sbcs d5, d5, xzr; \ - mov t1, #0x1fffffffffffffff; \ - adc d6, d6, t1; \ -/* Our correction multiplier is w = [d0 + (d0<<32)] mod 2^64 */ \ -/* Store it back into d0 since we no longer need that digit. 
*/ \ - add d0, d0, d0, lsl #32; \ -/* Now let [t3;t2;t1;-] = (2^384 - p_384) * w */ \ -/* We know the lowest word will cancel d0 so we don't need it */ \ - mov t1, #0xffffffff00000001; \ - umulh t1, t1, d0; \ - mov t2, #0x00000000ffffffff; \ - mul t3, t2, d0; \ - umulh t2, t2, d0; \ - adds t1, t1, t3; \ - adcs t2, t2, d0; \ - cset t3, cs; \ -/* Now x + p_384 * w = (x + 2^384 * w) - (2^384 - p_384) * w */ \ -/* We catch the net top carry from add-subtract in the digit d0 */ \ - adds d6, d6, d0; \ - cset d0, cs; \ - subs d1, d1, t1; \ - sbcs d2, d2, t2; \ - sbcs d3, d3, t3; \ - sbcs d4, d4, xzr; \ - sbcs d5, d5, xzr; \ - sbcs d6, d6, xzr; \ - sbcs d0, d0, xzr; \ -/* Now if d0 is nonzero we subtract p_384 (almost-Montgomery) */ \ - neg d0, d0; \ - and t1, d0, #0x00000000ffffffff; \ - and t2, d0, #0xffffffff00000000; \ - and t3, d0, #0xfffffffffffffffe; \ - subs d1, d1, t1; \ - sbcs d2, d2, t2; \ - sbcs d3, d3, t3; \ - sbcs d4, d4, d0; \ - sbcs d5, d5, d0; \ - sbc d6, d6, d0 - -// Very similar to a subroutine call to the s2n-bignum word_divstep59. -// But different in register usage and returning the final matrix in -// registers as follows -// -// [ m00 m01] -// [ m10 m11] - -#define divstep59() \ - and x4, x2, #0xfffff; \ - orr x4, x4, #0xfffffe0000000000; \ - and x5, x3, #0xfffff; \ - orr x5, x5, #0xc000000000000000; \ - tst x5, #0x1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr 
x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - asr x5, x5, #1; \ - add x8, x4, #0x100, lsl #12; \ - sbfx x8, x8, #21, #21; \ - mov x11, #0x100000; \ - add x11, x11, x11, lsl #21; \ - add x9, x4, x11; \ - asr x9, x9, #42; \ - add x10, x5, #0x100, lsl #12; \ - sbfx x10, x10, #21, #21; \ - add x11, x5, x11; \ - asr x11, x11, #42; \ - mul x6, x8, x2; \ - mul x7, x9, x3; \ - mul x2, x10, x2; \ - mul x3, x11, x3; \ - add x4, x6, x7; \ - add x5, x2, x3; \ - asr x2, x4, #20; \ - asr x3, x5, #20; \ - and x4, x2, #0xfffff; \ - orr x4, x4, #0xfffffe0000000000; \ - and x5, x3, #0xfffff; \ - orr x5, x5, #0xc000000000000000; \ - tst x5, #0x1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst 
x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - asr x5, x5, #1; \ - add x12, x4, #0x100, lsl #12; \ - sbfx x12, x12, #21, #21; \ - mov x15, #0x100000; \ - add x15, x15, x15, lsl #21; \ - add x13, x4, x15; \ - asr x13, x13, #42; \ - add x14, x5, #0x100, lsl #12; \ - sbfx x14, x14, #21, #21; \ - add x15, x5, x15; \ - asr x15, x15, #42; \ - mul x6, x12, x2; \ - mul x7, x13, x3; \ - mul x2, x14, x2; \ - mul x3, x15, x3; \ - add x4, x6, x7; \ - add x5, x2, x3; \ - asr x2, x4, #20; \ - asr x3, x5, #20; \ - and x4, x2, #0xfffff; \ - orr x4, x4, #0xfffffe0000000000; \ - and x5, x3, #0xfffff; \ - orr x5, x5, #0xc000000000000000; \ - tst x5, #0x1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - 
add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - mul x2, x12, x8; \ - mul x3, x12, x9; \ - mul x6, x14, x8; \ - mul x7, x14, x9; \ - madd x8, x13, x10, x2; \ - madd x9, x13, x11, x3; \ - madd x16, x15, x10, x6; \ - madd x17, x15, x11, x7; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, 
x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - asr x5, x5, #1; \ - add x12, x4, #0x100, lsl #12; \ - sbfx x12, x12, #22, #21; \ - mov x15, #0x100000; \ - add x15, x15, x15, lsl #21; \ - add x13, x4, x15; \ - asr x13, x13, #43; \ - add x14, x5, #0x100, lsl #12; \ - sbfx x14, x14, #22, #21; \ - add x15, x5, x15; \ - asr x15, x15, #43; \ - mneg x2, x12, x8; \ - mneg x3, x12, x9; \ - mneg x4, x14, x8; \ - mneg x5, x14, x9; \ - msub m00, x13, x16, x2; \ - msub m01, x13, x17, x3; \ - msub m10, x15, x16, x4; \ - msub m11, x15, x17, x5 - -S2N_BN_SYMBOL(bignum_inv_p384): - -// Save registers and make room for temporaries - - stp x19, x20, [sp, -16]! - stp x21, x22, [sp, -16]! - stp x23, x24, [sp, -16]! - sub sp, sp, NSPACE - -// Save the return pointer for the end so we can overwrite x0 later - - mov res, x0 - -// Copy the prime and input into the main f and g variables respectively. -// Make sure x is reduced so that g <= f as assumed in the bound proof. - - mov x10, #0x00000000ffffffff - mov x11, #0xffffffff00000000 - mov x12, #0xfffffffffffffffe - mov x15, #0xffffffffffffffff - stp x10, x11, [f] - stp x12, x15, [f+2*N] - stp x15, x15, [f+4*N] - str xzr, [f+6*N] - - ldp x2, x3, [x1] - subs x10, x2, x10 - sbcs x11, x3, x11 - ldp x4, x5, [x1, #(2*N)] - sbcs x12, x4, x12 - sbcs x13, x5, x15 - ldp x6, x7, [x1, #(4*N)] - sbcs x14, x6, x15 - sbcs x15, x7, x15 - - csel x2, x2, x10, cc - csel x3, x3, x11, cc - csel x4, x4, x12, cc - csel x5, x5, x13, cc - csel x6, x6, x14, cc - csel x7, x7, x15, cc - - stp x2, x3, [g] - stp x4, x5, [g+2*N] - stp x6, x7, [g+4*N] - str xzr, [g+6*N] - -// Also maintain reduced < 2^384 vector [u,v] such that -// [f,g] == x * 2^{5*i-75} * [u,v] (mod p_384) -// starting with [p_384,x] == x * 2^{5*0-75} * [0,2^75] (mod p_384) -// The weird-looking 5*i modifications come in because we are doing -// 64-bit word-sized Montgomery reductions at each stage, which is -// 5 bits more than the 59-bit requirement to keep things stable. - - stp xzr, xzr, [u] - stp xzr, xzr, [u+2*N] - stp xzr, xzr, [u+4*N] - - mov x10, #2048 - stp xzr, x10, [v] - stp xzr, xzr, [v+2*N] - stp xzr, xzr, [v+4*N] - -// Start of main loop. We jump into the middle so that the divstep -// portion is common to the special fifteenth iteration after a uniform -// first 14. - - mov i, #15 - mov d, #1 - b midloop - -loop: - -// Separate the matrix elements into sign-magnitude pairs - - cmp m00, xzr - csetm s00, mi - cneg m00, m00, mi - - cmp m01, xzr - csetm s01, mi - cneg m01, m01, mi - - cmp m10, xzr - csetm s10, mi - cneg m10, m10, mi - - cmp m11, xzr - csetm s11, mi - cneg m11, m11, mi - -// Adjust the initial values to allow for complement instead of negation -// This initial offset is the same for [f,g] and [u,v] compositions. -// Save it in stable registers for the [u,v] part and do [f,g] first. - - and x0, m00, s00 - and x1, m01, s01 - add car0, x0, x1 - - and x0, m10, s10 - and x1, m11, s11 - add car1, x0, x1 - -// Now the computation of the updated f and g values. This maintains a -// 2-word carry between stages so we can conveniently insert the shift -// right by 59 before storing back, and not overwrite digits we need -// again of the old f and g values. 
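
Stepping back to the "complement instead of negation" adjustment above: it relies on the identity m*x == (x XOR s)*|m| + (|m| AND s) modulo 2^64, where s is the all-ones mask for negative m, so the correction terms for a row's two products can be folded into the single initial offsets car0 and car1. A single-word C check of that identity (a sketch, not taken from the deleted file):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      /* Signed stand-ins for one row of the matrix (m00, m01) and two
         arbitrary input words (one digit of f and g). */
      int64_t m00 = -0x123456789abLL, m01 = 0x6543210fedLL;
      uint64_t x = 0xdeadbeefcafef00dULL, y = 0x0123456789abcdefULL;

      /* csetm / cneg: sign masks and magnitudes. */
      uint64_t s00 = m00 < 0 ? ~0ULL : 0, s01 = m01 < 0 ? ~0ULL : 0;
      uint64_t a00 = m00 < 0 ? (uint64_t)-m00 : (uint64_t)m00;
      uint64_t a01 = m01 < 0 ? (uint64_t)-m01 : (uint64_t)m01;

      /* The single initial offset, as in "add car0, x0, x1" above. */
      uint64_t car0 = (a00 & s00) + (a01 & s01);

      /* Modulo 2^64 (the lowest digit of the multi-word computation),
         m00*x + m01*y == (x^s00)*|m00| + (y^s01)*|m01| + car0. */
      uint64_t lhs = (uint64_t)m00 * x + (uint64_t)m01 * y;
      uint64_t rhs = (x ^ s00) * a00 + (y ^ s01) * a01 + car0;
      printf("%s\n", lhs == rhs ? "match" : "mismatch");
      return 0;
    }
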
-// -// Digit 0 of [f,g] - - ldr x7, [f] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, car0, x0 - adc x2, xzr, x1 - ldr x8, [g] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - adc x2, x2, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, car1, x0 - adc x3, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - adc x3, x3, x1 - -// Digit 1 of [f,g] - - ldr x7, [f+N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [g+N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - adc x6, x6, x1 - extr x4, x2, x4, #59 - str x4, [f] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x3, x3, x0 - adc x4, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x3, x3, x0 - adc x4, x4, x1 - extr x5, x3, x5, #59 - str x5, [g] - -// Digit 2 of [f,g] - - ldr x7, [f+2*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [g+2*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - adc x5, x5, x1 - extr x2, x6, x2, #59 - str x2, [f+N] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x4, x4, x0 - adc x2, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x4, x4, x0 - adc x2, x2, x1 - extr x3, x4, x3, #59 - str x3, [g+N] - -// Digit 3 of [f,g] - - ldr x7, [f+3*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, xzr, x1 - ldr x8, [g+3*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - adc x3, x3, x1 - extr x6, x5, x6, #59 - str x6, [f+2*N] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x2, x2, x0 - adc x6, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x2, x2, x0 - adc x6, x6, x1 - extr x4, x2, x4, #59 - str x4, [g+2*N] - -// Digit 4 of [f,g] - - ldr x7, [f+4*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x3, x3, x0 - adc x4, xzr, x1 - ldr x8, [g+4*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x3, x3, x0 - adc x4, x4, x1 - extr x5, x3, x5, #59 - str x5, [f+3*N] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x6, x6, x0 - adc x5, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x6, x6, x0 - adc x5, x5, x1 - extr x2, x6, x2, #59 - str x2, [g+3*N] - -// Digits 5 and 6 of [f,g] - - ldr x7, [f+5*N] - eor x1, x7, s00 - ldr x23, [f+6*N] - eor x2, x23, s00 - and x2, x2, m00 - neg x2, x2 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, x4, x0 - adc x2, x2, x1 - ldr x8, [g+5*N] - eor x1, x8, s01 - ldr x24, [g+6*N] - eor x0, x24, s01 - and x0, x0, m01 - sub x2, x2, x0 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - adc x2, x2, x1 - extr x3, x4, x3, #59 - str x3, [f+4*N] - extr x4, x2, x4, #59 - str x4, [f+5*N] - asr x2, x2, #59 - str x2, [f+6*N] - - eor x1, x7, s10 - eor x4, x23, s10 - and x4, x4, m10 - neg x4, x4 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, x5, x0 - adc x4, x4, x1 - eor x1, x8, s11 - eor x0, x24, s11 - and x0, x0, m11 - sub x4, x4, x0 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - adc x4, x4, x1 - extr x6, x5, x6, #59 - str x6, [g+4*N] - extr x5, x4, x5, #59 - str x5, [g+5*N] - asr x4, x4, #59 - str x4, [g+6*N] - -// Now the computation of the updated u and v values and their -// Montgomery reductions. 
A very similar accumulation except that -// the top words of u and v are unsigned and we don't shift. -// -// Digit 0 of [u,v] - - ldr x7, [u] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, car0, x0 - adc x2, xzr, x1 - ldr x8, [v] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u] - adc x2, x2, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, car1, x0 - adc x3, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - str x5, [v] - adc x3, x3, x1 - -// Digit 1 of [u,v] - - ldr x7, [u+N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [v+N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - str x2, [u+N] - adc x6, x6, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x3, x3, x0 - adc x4, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x3, x3, x0 - str x3, [v+N] - adc x4, x4, x1 - -// Digit 2 of [u,v] - - ldr x7, [u+2*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [v+2*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - str x6, [u+2*N] - adc x5, x5, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x4, x4, x0 - adc x2, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x4, x4, x0 - str x4, [v+2*N] - adc x2, x2, x1 - -// Digit 3 of [u,v] - - ldr x7, [u+3*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, xzr, x1 - ldr x8, [v+3*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - str x5, [u+3*N] - adc x3, x3, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x2, x2, x0 - adc x6, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x2, x2, x0 - str x2, [v+3*N] - adc x6, x6, x1 - -// Digit 4 of [u,v] - - ldr x7, [u+4*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x3, x3, x0 - adc x4, xzr, x1 - ldr x8, [v+4*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x3, x3, x0 - str x3, [u+4*N] - adc x4, x4, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x6, x6, x0 - adc x5, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x6, x6, x0 - str x6, [v+4*N] - adc x5, x5, x1 - -// Digits 5 and 6 of [u,v] (top is unsigned) - - ldr x7, [u+5*N] - eor x1, x7, s00 - and x2, s00, m00 - neg x2, x2 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, x4, x0 - adc x2, x2, x1 - ldr x8, [v+5*N] - eor x1, x8, s01 - and x0, s01, m01 - sub x2, x2, x0 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u+5*N] - adc x2, x2, x1 - str x2, [u+6*N] - - eor x1, x7, s10 - and x4, s10, m10 - neg x4, x4 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, x5, x0 - adc x4, x4, x1 - eor x1, x8, s11 - and x0, s11, m11 - sub x4, x4, x0 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - str x5, [v+5*N] - adc x4, x4, x1 - str x4, [v+6*N] - -// Montgomery reduction of u - - ldp x0, x1, [u] - ldp x2, x3, [u+16] - ldp x4, x5, [u+32] - ldr x6, [u+48] - amontred(x6,x5,x4,x3,x2,x1,x0, x9,x8,x7) - stp x1, x2, [u] - stp x3, x4, [u+16] - stp x5, x6, [u+32] - -// Montgomery reduction of v - - ldp x0, x1, [v] - ldp x2, x3, [v+16] - ldp x4, x5, [v+32] - ldr x6, [v+48] - amontred(x6,x5,x4,x3,x2,x1,x0, x9,x8,x7) - stp x1, x2, [v] - stp x3, x4, [v+16] - stp x5, x6, [v+32] - -midloop: - - mov x1, d - ldr x2, [f] - ldr x3, [g] - 
divstep59() - mov d, x1 - -// Next iteration - - subs i, i, #1 - bne loop - -// The 15th and last iteration does not need anything except the -// u value and the sign of f; the latter can be obtained from the -// lowest word of f. So it's done differently from the main loop. -// Find the sign of the new f. For this we just need one digit -// since we know (for in-scope cases) that f is either +1 or -1. -// We don't explicitly shift right by 59 either, but looking at -// bit 63 (or any bit >= 60) of the unshifted result is enough -// to distinguish -1 from +1; this is then made into a mask. - - ldr x0, [f] - ldr x1, [g] - mul x0, x0, m00 - madd x1, x1, m01, x0 - asr x0, x1, #63 - -// Now separate out the matrix into sign-magnitude pairs -// and adjust each one based on the sign of f. -// -// Note that at this point we expect |f|=1 and we got its -// sign above, so then since [f,0] == x * [u,v] (mod p_384) -// we want to flip the sign of u according to that of f. - - cmp m00, xzr - csetm s00, mi - cneg m00, m00, mi - eor s00, s00, x0 - - cmp m01, xzr - csetm s01, mi - cneg m01, m01, mi - eor s01, s01, x0 - - cmp m10, xzr - csetm s10, mi - cneg m10, m10, mi - eor s10, s10, x0 - - cmp m11, xzr - csetm s11, mi - cneg m11, m11, mi - eor s11, s11, x0 - -// Adjust the initial value to allow for complement instead of negation - - and x0, m00, s00 - and x1, m01, s01 - add car0, x0, x1 - -// Digit 0 of [u] - - ldr x7, [u] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, car0, x0 - adc x2, xzr, x1 - ldr x8, [v] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u] - adc x2, x2, x1 - -// Digit 1 of [u] - - ldr x7, [u+N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [v+N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - str x2, [u+N] - adc x6, x6, x1 - -// Digit 2 of [u] - - ldr x7, [u+2*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [v+2*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - str x6, [u+2*N] - adc x5, x5, x1 - -// Digit 3 of [u] - - ldr x7, [u+3*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, xzr, x1 - ldr x8, [v+3*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - str x5, [u+3*N] - adc x3, x3, x1 - -// Digit 4 of [u] - - ldr x7, [u+4*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x3, x3, x0 - adc x4, xzr, x1 - ldr x8, [v+4*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x3, x3, x0 - str x3, [u+4*N] - adc x4, x4, x1 - -// Digits 5 and 6 of [u] (top is unsigned) - - ldr x7, [u+5*N] - eor x1, x7, s00 - and x2, s00, m00 - neg x2, x2 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, x4, x0 - adc x2, x2, x1 - ldr x8, [v+5*N] - eor x1, x8, s01 - and x0, s01, m01 - sub x2, x2, x0 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u+5*N] - adc x2, x2, x1 - str x2, [u+6*N] - -// Montgomery reduction of u. 
This needs to be strict not "almost" -// so it is followed by an optional subtraction of p_384 - - ldp x10, x0, [u] - ldp x1, x2, [u+16] - ldp x3, x4, [u+32] - ldr x5, [u+48] - amontred(x5,x4,x3,x2,x1,x0,x10, x9,x8,x7) - - mov x10, #0x00000000ffffffff - subs x10, x0, x10 - mov x11, #0xffffffff00000000 - sbcs x11, x1, x11 - mov x12, #0xfffffffffffffffe - sbcs x12, x2, x12 - mov x15, #0xffffffffffffffff - sbcs x13, x3, x15 - sbcs x14, x4, x15 - sbcs x15, x5, x15 - - csel x0, x0, x10, cc - csel x1, x1, x11, cc - csel x2, x2, x12, cc - csel x3, x3, x13, cc - csel x4, x4, x14, cc - csel x5, x5, x15, cc - -// Store it back to the final output - - stp x0, x1, [res] - stp x2, x3, [res, #16] - stp x4, x5, [res, #32] - -// Restore stack and registers - - add sp, sp, NSPACE - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits -#endif diff --git a/third_party/s2n-bignum/arm/p384/bignum_montinv_p384.S b/third_party/s2n-bignum/arm/p384/bignum_montinv_p384.S deleted file mode 100644 index 79d59781196..00000000000 --- a/third_party/s2n-bignum/arm/p384/bignum_montinv_p384.S +++ /dev/null @@ -1,1487 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Montgomery inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1 -// Input x[6]; output z[6] -// -// extern void bignum_montinv_p384(uint64_t z[static 6],uint64_t x[static 6]); -// -// If the 6-digit input x is coprime to p_384, i.e. is not divisible -// by it, returns z < p_384 such that x * z == 2^768 (mod p_384). This -// is effectively "Montgomery inverse" because if we consider x and z as -// Montgomery forms of X and Z, i.e. x == 2^384 * X and z == 2^384 * Z -// (both mod p_384) then X * Z == 1 (mod p_384). That is, this function -// gives the analog of the modular inverse bignum_inv_p384 but with both -// input and output in the Montgomery domain. Note that x does not need -// to be reduced modulo p_384, but the output always is. If the input -// is divisible (i.e. is 0 or p_384), then there can be no solution to -// the congruence x * z == 2^768 (mod p_384), and z = 0 is returned. 
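
The Montgomery-inverse relation stated above (x * z == 2^768, i.e. X * Z == 1 for the underlying X = x/2^384 and Z = z/2^384) can be sanity-checked at toy scale. A C sketch under stand-in assumptions (a 64-bit prime p and R = 2^64 playing the roles of p_384 and 2^384; this is not the s2n-bignum API):

    #include <stdint.h>
    #include <stdio.h>

    typedef unsigned __int128 u128;   /* GCC/Clang extension */

    static uint64_t mulmod(uint64_t a, uint64_t b, uint64_t p) {
      return (uint64_t)(((u128)a * b) % p);
    }

    static uint64_t powmod(uint64_t a, uint64_t e, uint64_t p) {
      uint64_t r = 1;
      while (e) {
        if (e & 1) r = mulmod(r, a, p);
        a = mulmod(a, a, p);
        e >>= 1;
      }
      return r;
    }

    int main(void) {
      const uint64_t p = 0xffffffff00000001ULL;            /* a 64-bit prime */
      const uint64_t R = (uint64_t)(((u128)1 << 64) % p);  /* 2^64 mod p */
      uint64_t x = 0x123456789abcdef1ULL % p;              /* "Montgomery form" input */

      /* The analogue of the contract: z with x * z == R^2 (mod p). */
      uint64_t z = mulmod(mulmod(R, R, p), powmod(x, p - 2, p), p);
      printf("x*z == R^2:  %d\n", mulmod(x, z, p) == mulmod(R, R, p));

      /* Equivalently, for X = x/R and Z = z/R we get X * Z == 1 (mod p). */
      uint64_t Rinv = powmod(R, p - 2, p);
      uint64_t X = mulmod(x, Rinv, p), Z = mulmod(z, Rinv, p);
      printf("X*Z == 1:    %d\n", mulmod(X, Z, p) == 1);
      return 0;
    }
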
-// -// Standard ARM ABI: X0 = z, X1 = x -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montinv_p384) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montinv_p384) - - .text - .balign 4 - -// Size in bytes of a 64-bit word - -#define N 8 - -// Used for the return pointer - -#define res x20 - -// Loop counter and d = 2 * delta value for divstep - -#define i x21 -#define d x22 - -// Registers used for matrix element magnitudes and signs - -#define m00 x10 -#define m01 x11 -#define m10 x12 -#define m11 x13 -#define s00 x14 -#define s01 x15 -#define s10 x16 -#define s11 x17 - -// Initial carries for combinations - -#define car0 x9 -#define car1 x19 - -// Input and output, plain registers treated according to pattern - -#define reg0 x0, #0 -#define reg1 x1, #0 -#define reg2 x2, #0 -#define reg3 x3, #0 -#define reg4 x4, #0 - -#define x x1, #0 -#define z x0, #0 - -// Pointer-offset pairs for temporaries on stack -// The u and v variables are 6 words each as expected, but the f and g -// variables are 8 words each -- they need to have at least one extra -// word for a sign word, and to preserve alignment we "round up" to 8. -// In fact, we currently keep an extra word in u and v as well. - -#define f sp, #0 -#define g sp, #(8*N) -#define u sp, #(16*N) -#define v sp, #(24*N) - -// Total size to reserve on the stack - -#define NSPACE #(32*N) - -// --------------------------------------------------------------------------- -// Core signed almost-Montgomery reduction macro. Takes input in -// [d6;d5;d4;d3;d2;d1;d0] and returns result in [d6;d5d4;d3;d2;d1], adding -// to the existing [d6;d5;d4;d3;d2;d1], and re-using d0 as a temporary -// internally as well as t0, t1, t2. This is almost-Montgomery, i.e. the -// result fits in 6 digits but is not necessarily strictly reduced mod p_384. -// --------------------------------------------------------------------------- - -#define amontred(d6,d5,d4,d3,d2,d1,d0, t3,t2,t1) \ -/* We only know the input is -2^444 < x < 2^444. To do traditional */ \ -/* unsigned Montgomery reduction, start by adding 2^61 * p_384. */ \ - mov t1, #0xe000000000000000; \ - adds d0, d0, t1; \ - mov t2, #0x000000001fffffff; \ - adcs d1, d1, t2; \ - mov t3, #0xffffffffe0000000; \ - bic t3, t3, #0x2000000000000000; \ - adcs d2, d2, t3; \ - sbcs d3, d3, xzr; \ - sbcs d4, d4, xzr; \ - sbcs d5, d5, xzr; \ - mov t1, #0x1fffffffffffffff; \ - adc d6, d6, t1; \ -/* Our correction multiplier is w = [d0 + (d0<<32)] mod 2^64 */ \ -/* Store it back into d0 since we no longer need that digit. 
*/ \ - add d0, d0, d0, lsl #32; \ -/* Now let [t3;t2;t1;-] = (2^384 - p_384) * w */ \ -/* We know the lowest word will cancel d0 so we don't need it */ \ - mov t1, #0xffffffff00000001; \ - umulh t1, t1, d0; \ - mov t2, #0x00000000ffffffff; \ - mul t3, t2, d0; \ - umulh t2, t2, d0; \ - adds t1, t1, t3; \ - adcs t2, t2, d0; \ - cset t3, cs; \ -/* Now x + p_384 * w = (x + 2^384 * w) - (2^384 - p_384) * w */ \ -/* We catch the net top carry from add-subtract in the digit d0 */ \ - adds d6, d6, d0; \ - cset d0, cs; \ - subs d1, d1, t1; \ - sbcs d2, d2, t2; \ - sbcs d3, d3, t3; \ - sbcs d4, d4, xzr; \ - sbcs d5, d5, xzr; \ - sbcs d6, d6, xzr; \ - sbcs d0, d0, xzr; \ -/* Now if d0 is nonzero we subtract p_384 (almost-Montgomery) */ \ - neg d0, d0; \ - and t1, d0, #0x00000000ffffffff; \ - and t2, d0, #0xffffffff00000000; \ - and t3, d0, #0xfffffffffffffffe; \ - subs d1, d1, t1; \ - sbcs d2, d2, t2; \ - sbcs d3, d3, t3; \ - sbcs d4, d4, d0; \ - sbcs d5, d5, d0; \ - sbc d6, d6, d0 - -// Very similar to a subroutine call to the s2n-bignum word_divstep59. -// But different in register usage and returning the final matrix in -// registers as follows -// -// [ m00 m01] -// [ m10 m11] - -#define divstep59() \ - and x4, x2, #0xfffff; \ - orr x4, x4, #0xfffffe0000000000; \ - and x5, x3, #0xfffff; \ - orr x5, x5, #0xc000000000000000; \ - tst x5, #0x1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr 
x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - asr x5, x5, #1; \ - add x8, x4, #0x100, lsl #12; \ - sbfx x8, x8, #21, #21; \ - mov x11, #0x100000; \ - add x11, x11, x11, lsl #21; \ - add x9, x4, x11; \ - asr x9, x9, #42; \ - add x10, x5, #0x100, lsl #12; \ - sbfx x10, x10, #21, #21; \ - add x11, x5, x11; \ - asr x11, x11, #42; \ - mul x6, x8, x2; \ - mul x7, x9, x3; \ - mul x2, x10, x2; \ - mul x3, x11, x3; \ - add x4, x6, x7; \ - add x5, x2, x3; \ - asr x2, x4, #20; \ - asr x3, x5, #20; \ - and x4, x2, #0xfffff; \ - orr x4, x4, #0xfffffe0000000000; \ - and x5, x3, #0xfffff; \ - orr x5, x5, #0xc000000000000000; \ - tst x5, #0x1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst 
x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - asr x5, x5, #1; \ - add x12, x4, #0x100, lsl #12; \ - sbfx x12, x12, #21, #21; \ - mov x15, #0x100000; \ - add x15, x15, x15, lsl #21; \ - add x13, x4, x15; \ - asr x13, x13, #42; \ - add x14, x5, #0x100, lsl #12; \ - sbfx x14, x14, #21, #21; \ - add x15, x5, x15; \ - asr x15, x15, #42; \ - mul x6, x12, x2; \ - mul x7, x13, x3; \ - mul x2, x14, x2; \ - mul x3, x15, x3; \ - add x4, x6, x7; \ - add x5, x2, x3; \ - asr x2, x4, #20; \ - asr x3, x5, #20; \ - and x4, x2, #0xfffff; \ - orr x4, x4, #0xfffffe0000000000; \ - and x5, x3, #0xfffff; \ - orr x5, x5, #0xc000000000000000; \ - tst x5, #0x1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - 
add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - mul x2, x12, x8; \ - mul x3, x12, x9; \ - mul x6, x14, x8; \ - mul x7, x14, x9; \ - madd x8, x13, x10, x2; \ - madd x9, x13, x11, x3; \ - madd x16, x15, x10, x6; \ - madd x17, x15, x11, x7; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, 
x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - asr x5, x5, #1; \ - add x12, x4, #0x100, lsl #12; \ - sbfx x12, x12, #22, #21; \ - mov x15, #0x100000; \ - add x15, x15, x15, lsl #21; \ - add x13, x4, x15; \ - asr x13, x13, #43; \ - add x14, x5, #0x100, lsl #12; \ - sbfx x14, x14, #22, #21; \ - add x15, x5, x15; \ - asr x15, x15, #43; \ - mneg x2, x12, x8; \ - mneg x3, x12, x9; \ - mneg x4, x14, x8; \ - mneg x5, x14, x9; \ - msub m00, x13, x16, x2; \ - msub m01, x13, x17, x3; \ - msub m10, x15, x16, x4; \ - msub m11, x15, x17, x5 - -S2N_BN_SYMBOL(bignum_montinv_p384): - -// Save registers and make room for temporaries - - stp x19, x20, [sp, -16]! - stp x21, x22, [sp, -16]! - stp x23, x24, [sp, -16]! - sub sp, sp, NSPACE - -// Save the return pointer for the end so we can overwrite x0 later - - mov res, x0 - -// Copy the prime and input into the main f and g variables respectively. -// Make sure x is reduced so that g <= f as assumed in the bound proof. - - mov x10, #0x00000000ffffffff - mov x11, #0xffffffff00000000 - mov x12, #0xfffffffffffffffe - mov x15, #0xffffffffffffffff - stp x10, x11, [f] - stp x12, x15, [f+2*N] - stp x15, x15, [f+4*N] - str xzr, [f+6*N] - - ldp x2, x3, [x1] - subs x10, x2, x10 - sbcs x11, x3, x11 - ldp x4, x5, [x1, #(2*N)] - sbcs x12, x4, x12 - sbcs x13, x5, x15 - ldp x6, x7, [x1, #(4*N)] - sbcs x14, x6, x15 - sbcs x15, x7, x15 - - csel x2, x2, x10, cc - csel x3, x3, x11, cc - csel x4, x4, x12, cc - csel x5, x5, x13, cc - csel x6, x6, x14, cc - csel x7, x7, x15, cc - - stp x2, x3, [g] - stp x4, x5, [g+2*N] - stp x6, x7, [g+4*N] - str xzr, [g+6*N] - -// Also maintain reduced < 2^384 vector [u,v] such that -// [f,g] == x * 2^{5*i-843} * [u,v] (mod p_384) -// starting with [p_384,x] == x * 2^{5*0-843} * [0,2^843] (mod p_384) -// The weird-looking 5*i modifications come in because we are doing -// 64-bit word-sized Montgomery reductions at each stage, which is -// 5 bits more than the 59-bit requirement to keep things stable. -// After the 15th and last iteration and sign adjustment, when -// f == 1 for in-scope cases, we have x * 2^{75-843} * u == 1, i.e. -// x * u == 2^768 as required. - - stp xzr, xzr, [u] - stp xzr, xzr, [u+2*N] - stp xzr, xzr, [u+4*N] - -// The starting constant 2^843 mod p_384 is -// 0x0000000000000800:00001000000007ff:fffff00000000000 -// :00001000000007ff:fffff00000000800:0000000000000000 -// where colons separate 64-bit subwords, least significant at the right. -// Not all of these are single loads on ARM so this is a bit dynamic - - mov x12, #0xfffff00000000000 - orr x10, x12, #0x0000000000000800 - stp xzr, x10, [v] - mov x11, #0x00000000000007ff - orr x11, x11, #0x0000100000000000 - stp x11, x12, [v+2*N] - mov x12, #0x0000000000000800 - stp x11, x12, [v+4*N] - -// Start of main loop. We jump into the middle so that the divstep -// portion is common to the special fifteenth iteration after a uniform -// first 14. 
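
[Editor's note, not part of the patch: the exponent bookkeeping in the comment above can be checked independently. The standalone Python sketch below takes p_384 and the quoted limb values from the comments and code above, and verifies that the starting constant really is 2^843 mod p_384 and that, after the 15 iterations, the exponent 5*15 - 843 = -768 makes u the inverse still in Montgomery form.]

    p384 = 2**384 - 2**128 - 2**96 + 2**32 - 1
    start = int("0000000000000800" "00001000000007ff" "fffff00000000000"
                "00001000000007ff" "fffff00000000800" "0000000000000000", 16)
    assert pow(2, 843, p384) == start        # the constant quoted in the comment
    assert 5 * 15 - 843 == -768              # exponent after the 15th iteration
    # x*u == 2^768 (mod p_384) means: for a Montgomery-encoded input 2^384*a,
    # the result u equals 2^384 * a^{-1}, i.e. the inverse stays Montgomery-encoded.
    a = 0x1234                               # arbitrary nonzero example value
    x_in = (2**384 * a) % p384
    u = (2**768 * pow(x_in, -1, p384)) % p384
    assert u == (2**384 * pow(a, -1, p384)) % p384

[So the routine maps a Montgomery-domain input directly to a Montgomery-domain inverse, with no extra domain conversion needed.]
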
- - mov i, #15 - mov d, #1 - b bignum_montinv_p384_midloop - -bignum_montinv_p384_loop: - -// Separate the matrix elements into sign-magnitude pairs - - cmp m00, xzr - csetm s00, mi - cneg m00, m00, mi - - cmp m01, xzr - csetm s01, mi - cneg m01, m01, mi - - cmp m10, xzr - csetm s10, mi - cneg m10, m10, mi - - cmp m11, xzr - csetm s11, mi - cneg m11, m11, mi - -// Adjust the initial values to allow for complement instead of negation -// This initial offset is the same for [f,g] and [u,v] compositions. -// Save it in stable registers for the [u,v] part and do [f,g] first. - - and x0, m00, s00 - and x1, m01, s01 - add car0, x0, x1 - - and x0, m10, s10 - and x1, m11, s11 - add car1, x0, x1 - -// Now the computation of the updated f and g values. This maintains a -// 2-word carry between stages so we can conveniently insert the shift -// right by 59 before storing back, and not overwrite digits we need -// again of the old f and g values. -// -// Digit 0 of [f,g] - - ldr x7, [f] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, car0, x0 - adc x2, xzr, x1 - ldr x8, [g] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - adc x2, x2, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, car1, x0 - adc x3, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - adc x3, x3, x1 - -// Digit 1 of [f,g] - - ldr x7, [f+N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [g+N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - adc x6, x6, x1 - extr x4, x2, x4, #59 - str x4, [f] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x3, x3, x0 - adc x4, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x3, x3, x0 - adc x4, x4, x1 - extr x5, x3, x5, #59 - str x5, [g] - -// Digit 2 of [f,g] - - ldr x7, [f+2*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [g+2*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - adc x5, x5, x1 - extr x2, x6, x2, #59 - str x2, [f+N] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x4, x4, x0 - adc x2, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x4, x4, x0 - adc x2, x2, x1 - extr x3, x4, x3, #59 - str x3, [g+N] - -// Digit 3 of [f,g] - - ldr x7, [f+3*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, xzr, x1 - ldr x8, [g+3*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - adc x3, x3, x1 - extr x6, x5, x6, #59 - str x6, [f+2*N] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x2, x2, x0 - adc x6, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x2, x2, x0 - adc x6, x6, x1 - extr x4, x2, x4, #59 - str x4, [g+2*N] - -// Digit 4 of [f,g] - - ldr x7, [f+4*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x3, x3, x0 - adc x4, xzr, x1 - ldr x8, [g+4*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x3, x3, x0 - adc x4, x4, x1 - extr x5, x3, x5, #59 - str x5, [f+3*N] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x6, x6, x0 - adc x5, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x6, x6, x0 - adc x5, x5, x1 - extr x2, x6, x2, #59 - str x2, [g+3*N] - -// Digits 5 and 6 of [f,g] - - ldr x7, [f+5*N] - eor x1, x7, s00 - ldr x23, [f+6*N] - eor x2, x23, s00 - and x2, x2, m00 - neg x2, x2 - mul x0, x1, 
m00 - umulh x1, x1, m00 - adds x4, x4, x0 - adc x2, x2, x1 - ldr x8, [g+5*N] - eor x1, x8, s01 - ldr x24, [g+6*N] - eor x0, x24, s01 - and x0, x0, m01 - sub x2, x2, x0 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - adc x2, x2, x1 - extr x3, x4, x3, #59 - str x3, [f+4*N] - extr x4, x2, x4, #59 - str x4, [f+5*N] - asr x2, x2, #59 - str x2, [f+6*N] - - eor x1, x7, s10 - eor x4, x23, s10 - and x4, x4, m10 - neg x4, x4 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, x5, x0 - adc x4, x4, x1 - eor x1, x8, s11 - eor x0, x24, s11 - and x0, x0, m11 - sub x4, x4, x0 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - adc x4, x4, x1 - extr x6, x5, x6, #59 - str x6, [g+4*N] - extr x5, x4, x5, #59 - str x5, [g+5*N] - asr x4, x4, #59 - str x4, [g+6*N] - -// Now the computation of the updated u and v values and their -// Montgomery reductions. A very similar accumulation except that -// the top words of u and v are unsigned and we don't shift. -// -// Digit 0 of [u,v] - - ldr x7, [u] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, car0, x0 - adc x2, xzr, x1 - ldr x8, [v] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u] - adc x2, x2, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, car1, x0 - adc x3, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - str x5, [v] - adc x3, x3, x1 - -// Digit 1 of [u,v] - - ldr x7, [u+N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [v+N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - str x2, [u+N] - adc x6, x6, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x3, x3, x0 - adc x4, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x3, x3, x0 - str x3, [v+N] - adc x4, x4, x1 - -// Digit 2 of [u,v] - - ldr x7, [u+2*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [v+2*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - str x6, [u+2*N] - adc x5, x5, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x4, x4, x0 - adc x2, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x4, x4, x0 - str x4, [v+2*N] - adc x2, x2, x1 - -// Digit 3 of [u,v] - - ldr x7, [u+3*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, xzr, x1 - ldr x8, [v+3*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - str x5, [u+3*N] - adc x3, x3, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x2, x2, x0 - adc x6, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x2, x2, x0 - str x2, [v+3*N] - adc x6, x6, x1 - -// Digit 4 of [u,v] - - ldr x7, [u+4*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x3, x3, x0 - adc x4, xzr, x1 - ldr x8, [v+4*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x3, x3, x0 - str x3, [u+4*N] - adc x4, x4, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x6, x6, x0 - adc x5, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x6, x6, x0 - str x6, [v+4*N] - adc x5, x5, x1 - -// Digits 5 and 6 of [u,v] (top is unsigned) - - ldr x7, [u+5*N] - eor x1, x7, s00 - and x2, s00, m00 - neg x2, x2 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, x4, x0 - adc x2, x2, x1 - ldr x8, [v+5*N] - eor x1, x8, s01 - and x0, s01, m01 - sub x2, x2, x0 - mul x0, x1, m01 - 
umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u+5*N] - adc x2, x2, x1 - str x2, [u+6*N] - - eor x1, x7, s10 - and x4, s10, m10 - neg x4, x4 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, x5, x0 - adc x4, x4, x1 - eor x1, x8, s11 - and x0, s11, m11 - sub x4, x4, x0 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - str x5, [v+5*N] - adc x4, x4, x1 - str x4, [v+6*N] - -// Montgomery reduction of u - - ldp x0, x1, [u] - ldp x2, x3, [u+16] - ldp x4, x5, [u+32] - ldr x6, [u+48] - amontred(x6,x5,x4,x3,x2,x1,x0, x9,x8,x7) - stp x1, x2, [u] - stp x3, x4, [u+16] - stp x5, x6, [u+32] - -// Montgomery reduction of v - - ldp x0, x1, [v] - ldp x2, x3, [v+16] - ldp x4, x5, [v+32] - ldr x6, [v+48] - amontred(x6,x5,x4,x3,x2,x1,x0, x9,x8,x7) - stp x1, x2, [v] - stp x3, x4, [v+16] - stp x5, x6, [v+32] - -bignum_montinv_p384_midloop: - - mov x1, d - ldr x2, [f] - ldr x3, [g] - divstep59() - mov d, x1 - -// Next iteration - - subs i, i, #1 - bne bignum_montinv_p384_loop - -// The 15th and last iteration does not need anything except the -// u value and the sign of f; the latter can be obtained from the -// lowest word of f. So it's done differently from the main loop. -// Find the sign of the new f. For this we just need one digit -// since we know (for in-scope cases) that f is either +1 or -1. -// We don't explicitly shift right by 59 either, but looking at -// bit 63 (or any bit >= 60) of the unshifted result is enough -// to distinguish -1 from +1; this is then made into a mask. - - ldr x0, [f] - ldr x1, [g] - mul x0, x0, m00 - madd x1, x1, m01, x0 - asr x0, x1, #63 - -// Now separate out the matrix into sign-magnitude pairs -// and adjust each one based on the sign of f. -// -// Note that at this point we expect |f|=1 and we got its -// sign above, so then since [f,0] == x * 2^{-768} [u,v] (mod p_384) -// we want to flip the sign of u according to that of f. 
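
[Editor's note, not part of the patch: the sign handling here uses two standard two's-complement identities. An arithmetic right shift of bit 63 turns the sign into an all-zeros or all-ones mask, and negation is replaced by XOR-with-mask, with the missing +1 (scaled by the corresponding multiplier) folded into the initial accumulator, which is what the car0 adjustment above does. A minimal Python illustration of the masking idea, with 64-bit semantics emulated explicitly; this is illustrative only, not library code.]

    M64 = (1 << 64) - 1

    def sign_mask(x64):
        # like "asr #63": 0 for non-negative, all-ones for negative values
        return M64 if (x64 >> 63) & 1 else 0

    def cond_negate(x64, mask):
        # (x ^ mask) + (mask & 1) equals -x mod 2^64 when mask is all-ones,
        # and x unchanged when mask is zero
        return ((x64 ^ mask) + (mask & 1)) & M64

    assert cond_negate(5, sign_mask((-1) & M64)) == (-5) & M64
    assert cond_negate(5, sign_mask(1)) == 5
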
- - cmp m00, xzr - csetm s00, mi - cneg m00, m00, mi - eor s00, s00, x0 - - cmp m01, xzr - csetm s01, mi - cneg m01, m01, mi - eor s01, s01, x0 - - cmp m10, xzr - csetm s10, mi - cneg m10, m10, mi - eor s10, s10, x0 - - cmp m11, xzr - csetm s11, mi - cneg m11, m11, mi - eor s11, s11, x0 - -// Adjust the initial value to allow for complement instead of negation - - and x0, m00, s00 - and x1, m01, s01 - add car0, x0, x1 - -// Digit 0 of [u] - - ldr x7, [u] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, car0, x0 - adc x2, xzr, x1 - ldr x8, [v] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u] - adc x2, x2, x1 - -// Digit 1 of [u] - - ldr x7, [u+N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [v+N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - str x2, [u+N] - adc x6, x6, x1 - -// Digit 2 of [u] - - ldr x7, [u+2*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [v+2*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - str x6, [u+2*N] - adc x5, x5, x1 - -// Digit 3 of [u] - - ldr x7, [u+3*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, xzr, x1 - ldr x8, [v+3*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - str x5, [u+3*N] - adc x3, x3, x1 - -// Digit 4 of [u] - - ldr x7, [u+4*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x3, x3, x0 - adc x4, xzr, x1 - ldr x8, [v+4*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x3, x3, x0 - str x3, [u+4*N] - adc x4, x4, x1 - -// Digits 5 and 6 of [u] (top is unsigned) - - ldr x7, [u+5*N] - eor x1, x7, s00 - and x2, s00, m00 - neg x2, x2 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, x4, x0 - adc x2, x2, x1 - ldr x8, [v+5*N] - eor x1, x8, s01 - and x0, s01, m01 - sub x2, x2, x0 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u+5*N] - adc x2, x2, x1 - str x2, [u+6*N] - -// Montgomery reduction of u. This needs to be strict not "almost" -// so it is followed by an optional subtraction of p_384 - - ldp x10, x0, [u] - ldp x1, x2, [u+16] - ldp x3, x4, [u+32] - ldr x5, [u+48] - amontred(x5,x4,x3,x2,x1,x0,x10, x9,x8,x7) - - mov x10, #0x00000000ffffffff - subs x10, x0, x10 - mov x11, #0xffffffff00000000 - sbcs x11, x1, x11 - mov x12, #0xfffffffffffffffe - sbcs x12, x2, x12 - mov x15, #0xffffffffffffffff - sbcs x13, x3, x15 - sbcs x14, x4, x15 - sbcs x15, x5, x15 - - csel x0, x0, x10, cc - csel x1, x1, x11, cc - csel x2, x2, x12, cc - csel x3, x3, x13, cc - csel x4, x4, x14, cc - csel x5, x5, x15, cc - -// Store it back to the final output - - stp x0, x1, [res] - stp x2, x3, [res, #16] - stp x4, x5, [res, #32] - -// Restore stack and registers - - add sp, sp, NSPACE - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits -#endif diff --git a/third_party/s2n-bignum/arm/p384/p384_montjadd_alt.S b/third_party/s2n-bignum/arm/p384/p384_montjadd_alt.S deleted file mode 100644 index b84065dea97..00000000000 --- a/third_party/s2n-bignum/arm/p384/p384_montjadd_alt.S +++ /dev/null @@ -1,993 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Point addition on NIST curve P-384 in Montgomery-Jacobian coordinates -// -// extern void p384_montjadd_alt -// (uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 18]); -// -// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with -// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. -// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). -// -// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjadd_alt) - S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjadd_alt) - .text - .balign 4 - -// Size of individual field elements - -#define NUMSIZE 48 - -// Stable homes for input arguments during main code sequence - -#define input_z x24 -#define input_x x25 -#define input_y x26 - -// Pointer-offset pairs for inputs and outputs - -#define x_1 input_x, #0 -#define y_1 input_x, #NUMSIZE -#define z_1 input_x, #(2*NUMSIZE) - -#define x_2 input_y, #0 -#define y_2 input_y, #NUMSIZE -#define z_2 input_y, #(2*NUMSIZE) - -#define x_3 input_z, #0 -#define y_3 input_z, #NUMSIZE -#define z_3 input_z, #(2*NUMSIZE) - -// Pointer-offset pairs for temporaries, with some aliasing -// NSPACE is the total stack needed for these temporaries - -#define z1sq sp, #(NUMSIZE*0) -#define ww sp, #(NUMSIZE*0) -#define resx sp, #(NUMSIZE*0) - -#define yd sp, #(NUMSIZE*1) -#define y2a sp, #(NUMSIZE*1) - -#define x2a sp, #(NUMSIZE*2) -#define zzx2 sp, #(NUMSIZE*2) - -#define zz sp, #(NUMSIZE*3) -#define t1 sp, #(NUMSIZE*3) - -#define t2 sp, #(NUMSIZE*4) -#define x1a sp, #(NUMSIZE*4) -#define zzx1 sp, #(NUMSIZE*4) -#define resy sp, #(NUMSIZE*4) - -#define xd sp, #(NUMSIZE*5) -#define z2sq sp, #(NUMSIZE*5) -#define resz sp, #(NUMSIZE*5) - -#define y1a sp, #(NUMSIZE*6) - -#define NSPACE (NUMSIZE*7) - -// Corresponds exactly to bignum_montmul_p384_alt - -#define montmul_p384(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x12, x3, x5; \ - umulh x13, x3, x5; \ - mul x11, x3, x6; \ - umulh x14, x3, x6; \ - adds x13, x13, x11; \ - ldp x7, x8, [P2+16]; \ - mul x11, x3, x7; \ - umulh x15, x3, x7; \ - adcs x14, x14, x11; \ - mul x11, x3, x8; \ - umulh x16, x3, x8; \ - adcs x15, x15, x11; \ - ldp x9, x10, [P2+32]; \ - mul x11, x3, x9; \ - umulh x17, x3, x9; \ - adcs x16, x16, x11; \ - mul x11, x3, x10; \ - umulh x19, x3, x10; \ - adcs x17, x17, x11; \ - adc x19, x19, xzr; \ - mul x11, x4, x5; \ - adds x13, x13, x11; \ - mul x11, x4, x6; \ - adcs x14, x14, x11; \ - mul x11, x4, x7; \ - adcs x15, x15, x11; \ - mul x11, x4, x8; \ - adcs x16, x16, x11; \ - mul x11, x4, x9; \ - adcs x17, x17, x11; \ - mul x11, x4, x10; \ - adcs x19, x19, x11; \ - cset x20, cs; \ - umulh x11, x4, x5; \ - adds x14, x14, x11; \ - umulh x11, x4, x6; \ - adcs x15, x15, x11; \ - umulh x11, x4, x7; \ - adcs x16, x16, x11; \ - umulh x11, x4, x8; \ - adcs x17, x17, x11; \ - umulh x11, x4, x9; \ - adcs x19, x19, x11; \ - umulh x11, x4, x10; \ - adc x20, x20, x11; \ - ldp x3, x4, [P1+16]; \ - mul x11, x3, x5; \ - adds x14, x14, x11; \ - mul x11, x3, x6; \ - adcs x15, x15, x11; \ - mul x11, x3, x7; \ - adcs x16, x16, x11; \ - mul x11, x3, x8; \ - adcs x17, x17, x11; \ - mul x11, x3, x9; \ - adcs x19, x19, x11; \ - mul x11, x3, x10; \ - adcs x20, x20, x11; \ - cset x21, cs; \ - umulh x11, x3, x5; \ - 
adds x15, x15, x11; \ - umulh x11, x3, x6; \ - adcs x16, x16, x11; \ - umulh x11, x3, x7; \ - adcs x17, x17, x11; \ - umulh x11, x3, x8; \ - adcs x19, x19, x11; \ - umulh x11, x3, x9; \ - adcs x20, x20, x11; \ - umulh x11, x3, x10; \ - adc x21, x21, x11; \ - mul x11, x4, x5; \ - adds x15, x15, x11; \ - mul x11, x4, x6; \ - adcs x16, x16, x11; \ - mul x11, x4, x7; \ - adcs x17, x17, x11; \ - mul x11, x4, x8; \ - adcs x19, x19, x11; \ - mul x11, x4, x9; \ - adcs x20, x20, x11; \ - mul x11, x4, x10; \ - adcs x21, x21, x11; \ - cset x22, cs; \ - umulh x11, x4, x5; \ - adds x16, x16, x11; \ - umulh x11, x4, x6; \ - adcs x17, x17, x11; \ - umulh x11, x4, x7; \ - adcs x19, x19, x11; \ - umulh x11, x4, x8; \ - adcs x20, x20, x11; \ - umulh x11, x4, x9; \ - adcs x21, x21, x11; \ - umulh x11, x4, x10; \ - adc x22, x22, x11; \ - ldp x3, x4, [P1+32]; \ - mul x11, x3, x5; \ - adds x16, x16, x11; \ - mul x11, x3, x6; \ - adcs x17, x17, x11; \ - mul x11, x3, x7; \ - adcs x19, x19, x11; \ - mul x11, x3, x8; \ - adcs x20, x20, x11; \ - mul x11, x3, x9; \ - adcs x21, x21, x11; \ - mul x11, x3, x10; \ - adcs x22, x22, x11; \ - cset x2, cs; \ - umulh x11, x3, x5; \ - adds x17, x17, x11; \ - umulh x11, x3, x6; \ - adcs x19, x19, x11; \ - umulh x11, x3, x7; \ - adcs x20, x20, x11; \ - umulh x11, x3, x8; \ - adcs x21, x21, x11; \ - umulh x11, x3, x9; \ - adcs x22, x22, x11; \ - umulh x11, x3, x10; \ - adc x2, x2, x11; \ - mul x11, x4, x5; \ - adds x17, x17, x11; \ - mul x11, x4, x6; \ - adcs x19, x19, x11; \ - mul x11, x4, x7; \ - adcs x20, x20, x11; \ - mul x11, x4, x8; \ - adcs x21, x21, x11; \ - mul x11, x4, x9; \ - adcs x22, x22, x11; \ - mul x11, x4, x10; \ - adcs x2, x2, x11; \ - cset x1, cs; \ - umulh x11, x4, x5; \ - adds x19, x19, x11; \ - umulh x11, x4, x6; \ - adcs x20, x20, x11; \ - umulh x11, x4, x7; \ - adcs x21, x21, x11; \ - umulh x11, x4, x8; \ - adcs x22, x22, x11; \ - umulh x11, x4, x9; \ - adcs x2, x2, x11; \ - umulh x11, x4, x10; \ - adc x1, x1, x11; \ - lsl x7, x12, #32; \ - add x12, x7, x12; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x12; \ - mov x6, #0xffffffff; \ - mul x5, x6, x12; \ - umulh x6, x6, x12; \ - adds x7, x7, x5; \ - adcs x6, x6, x12; \ - adc x5, xzr, xzr; \ - subs x13, x13, x7; \ - sbcs x14, x14, x6; \ - sbcs x15, x15, x5; \ - sbcs x16, x16, xzr; \ - sbcs x17, x17, xzr; \ - sbc x12, x12, xzr; \ - lsl x7, x13, #32; \ - add x13, x7, x13; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x13; \ - mov x6, #0xffffffff; \ - mul x5, x6, x13; \ - umulh x6, x6, x13; \ - adds x7, x7, x5; \ - adcs x6, x6, x13; \ - adc x5, xzr, xzr; \ - subs x14, x14, x7; \ - sbcs x15, x15, x6; \ - sbcs x16, x16, x5; \ - sbcs x17, x17, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - lsl x7, x14, #32; \ - add x14, x7, x14; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x14; \ - mov x6, #0xffffffff; \ - mul x5, x6, x14; \ - umulh x6, x6, x14; \ - adds x7, x7, x5; \ - adcs x6, x6, x14; \ - adc x5, xzr, xzr; \ - subs x15, x15, x7; \ - sbcs x16, x16, x6; \ - sbcs x17, x17, x5; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbc x14, x14, xzr; \ - lsl x7, x15, #32; \ - add x15, x7, x15; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x15; \ - mov x6, #0xffffffff; \ - mul x5, x6, x15; \ - umulh x6, x6, x15; \ - adds x7, x7, x5; \ - adcs x6, x6, x15; \ - adc x5, xzr, xzr; \ - subs x16, x16, x7; \ - sbcs x17, x17, x6; \ - sbcs x12, x12, x5; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbc x15, x15, xzr; \ - lsl x7, x16, #32; \ - add x16, x7, x16; \ - mov x7, 
#0xffffffff00000001; \ - umulh x7, x7, x16; \ - mov x6, #0xffffffff; \ - mul x5, x6, x16; \ - umulh x6, x6, x16; \ - adds x7, x7, x5; \ - adcs x6, x6, x16; \ - adc x5, xzr, xzr; \ - subs x17, x17, x7; \ - sbcs x12, x12, x6; \ - sbcs x13, x13, x5; \ - sbcs x14, x14, xzr; \ - sbcs x15, x15, xzr; \ - sbc x16, x16, xzr; \ - lsl x7, x17, #32; \ - add x17, x7, x17; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x17; \ - mov x6, #0xffffffff; \ - mul x5, x6, x17; \ - umulh x6, x6, x17; \ - adds x7, x7, x5; \ - adcs x6, x6, x17; \ - adc x5, xzr, xzr; \ - subs x12, x12, x7; \ - sbcs x13, x13, x6; \ - sbcs x14, x14, x5; \ - sbcs x15, x15, xzr; \ - sbcs x16, x16, xzr; \ - sbc x17, x17, xzr; \ - adds x12, x12, x19; \ - adcs x13, x13, x20; \ - adcs x14, x14, x21; \ - adcs x15, x15, x22; \ - adcs x16, x16, x2; \ - adcs x17, x17, x1; \ - adc x10, xzr, xzr; \ - mov x11, #0xffffffff00000001; \ - adds x19, x12, x11; \ - mov x11, #0xffffffff; \ - adcs x20, x13, x11; \ - mov x11, #0x1; \ - adcs x21, x14, x11; \ - adcs x22, x15, xzr; \ - adcs x2, x16, xzr; \ - adcs x1, x17, xzr; \ - adcs x10, x10, xzr; \ - csel x12, x12, x19, eq; \ - csel x13, x13, x20, eq; \ - csel x14, x14, x21, eq; \ - csel x15, x15, x22, eq; \ - csel x16, x16, x2, eq; \ - csel x17, x17, x1, eq; \ - stp x12, x13, [P0]; \ - stp x14, x15, [P0+16]; \ - stp x16, x17, [P0+32] - -// Corresponds exactly to bignum_montsqr_p384_alt - -#define montsqr_p384(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x9, x2, x3; \ - umulh x10, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x8, x2, x4; \ - adds x10, x10, x8; \ - mul x11, x2, x5; \ - mul x8, x3, x4; \ - adcs x11, x11, x8; \ - umulh x12, x2, x5; \ - mul x8, x3, x5; \ - adcs x12, x12, x8; \ - ldp x6, x7, [P1+32]; \ - mul x13, x2, x7; \ - mul x8, x3, x6; \ - adcs x13, x13, x8; \ - umulh x14, x2, x7; \ - mul x8, x3, x7; \ - adcs x14, x14, x8; \ - mul x15, x5, x6; \ - adcs x15, x15, xzr; \ - umulh x16, x5, x6; \ - adc x16, x16, xzr; \ - umulh x8, x2, x4; \ - adds x11, x11, x8; \ - umulh x8, x3, x4; \ - adcs x12, x12, x8; \ - umulh x8, x3, x5; \ - adcs x13, x13, x8; \ - umulh x8, x3, x6; \ - adcs x14, x14, x8; \ - umulh x8, x3, x7; \ - adcs x15, x15, x8; \ - adc x16, x16, xzr; \ - mul x8, x2, x6; \ - adds x12, x12, x8; \ - mul x8, x4, x5; \ - adcs x13, x13, x8; \ - mul x8, x4, x6; \ - adcs x14, x14, x8; \ - mul x8, x4, x7; \ - adcs x15, x15, x8; \ - mul x8, x5, x7; \ - adcs x16, x16, x8; \ - mul x17, x6, x7; \ - adcs x17, x17, xzr; \ - umulh x19, x6, x7; \ - adc x19, x19, xzr; \ - umulh x8, x2, x6; \ - adds x13, x13, x8; \ - umulh x8, x4, x5; \ - adcs x14, x14, x8; \ - umulh x8, x4, x6; \ - adcs x15, x15, x8; \ - umulh x8, x4, x7; \ - adcs x16, x16, x8; \ - umulh x8, x5, x7; \ - adcs x17, x17, x8; \ - adc x19, x19, xzr; \ - adds x9, x9, x9; \ - adcs x10, x10, x10; \ - adcs x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - cset x20, hs; \ - umulh x8, x2, x2; \ - mul x2, x2, x2; \ - adds x9, x9, x8; \ - mul x8, x3, x3; \ - adcs x10, x10, x8; \ - umulh x8, x3, x3; \ - adcs x11, x11, x8; \ - mul x8, x4, x4; \ - adcs x12, x12, x8; \ - umulh x8, x4, x4; \ - adcs x13, x13, x8; \ - mul x8, x5, x5; \ - adcs x14, x14, x8; \ - umulh x8, x5, x5; \ - adcs x15, x15, x8; \ - mul x8, x6, x6; \ - adcs x16, x16, x8; \ - umulh x8, x6, x6; \ - adcs x17, x17, x8; \ - mul x8, x7, x7; \ - adcs x19, x19, x8; \ - umulh x8, x7, x7; \ - adc x20, x20, x8; \ - lsl x5, x2, #32; \ - add x2, x5, x2; \ - mov x5, 
#-4294967295; \ - umulh x5, x5, x2; \ - mov x4, #4294967295; \ - mul x3, x4, x2; \ - umulh x4, x4, x2; \ - adds x5, x5, x3; \ - adcs x4, x4, x2; \ - adc x3, xzr, xzr; \ - subs x9, x9, x5; \ - sbcs x10, x10, x4; \ - sbcs x11, x11, x3; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbc x2, x2, xzr; \ - lsl x5, x9, #32; \ - add x9, x5, x9; \ - mov x5, #-4294967295; \ - umulh x5, x5, x9; \ - mov x4, #4294967295; \ - mul x3, x4, x9; \ - umulh x4, x4, x9; \ - adds x5, x5, x3; \ - adcs x4, x4, x9; \ - adc x3, xzr, xzr; \ - subs x10, x10, x5; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ - sbcs x13, x13, xzr; \ - sbcs x2, x2, xzr; \ - sbc x9, x9, xzr; \ - lsl x5, x10, #32; \ - add x10, x5, x10; \ - mov x5, #-4294967295; \ - umulh x5, x5, x10; \ - mov x4, #4294967295; \ - mul x3, x4, x10; \ - umulh x4, x4, x10; \ - adds x5, x5, x3; \ - adcs x4, x4, x10; \ - adc x3, xzr, xzr; \ - subs x11, x11, x5; \ - sbcs x12, x12, x4; \ - sbcs x13, x13, x3; \ - sbcs x2, x2, xzr; \ - sbcs x9, x9, xzr; \ - sbc x10, x10, xzr; \ - lsl x5, x11, #32; \ - add x11, x5, x11; \ - mov x5, #-4294967295; \ - umulh x5, x5, x11; \ - mov x4, #4294967295; \ - mul x3, x4, x11; \ - umulh x4, x4, x11; \ - adds x5, x5, x3; \ - adcs x4, x4, x11; \ - adc x3, xzr, xzr; \ - subs x12, x12, x5; \ - sbcs x13, x13, x4; \ - sbcs x2, x2, x3; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbc x11, x11, xzr; \ - lsl x5, x12, #32; \ - add x12, x5, x12; \ - mov x5, #-4294967295; \ - umulh x5, x5, x12; \ - mov x4, #4294967295; \ - mul x3, x4, x12; \ - umulh x4, x4, x12; \ - adds x5, x5, x3; \ - adcs x4, x4, x12; \ - adc x3, xzr, xzr; \ - subs x13, x13, x5; \ - sbcs x2, x2, x4; \ - sbcs x9, x9, x3; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbc x12, x12, xzr; \ - lsl x5, x13, #32; \ - add x13, x5, x13; \ - mov x5, #-4294967295; \ - umulh x5, x5, x13; \ - mov x4, #4294967295; \ - mul x3, x4, x13; \ - umulh x4, x4, x13; \ - adds x5, x5, x3; \ - adcs x4, x4, x13; \ - adc x3, xzr, xzr; \ - subs x2, x2, x5; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, x17; \ - adcs x12, x12, x19; \ - adcs x13, x13, x20; \ - adc x6, xzr, xzr; \ - mov x8, #-4294967295; \ - adds x14, x2, x8; \ - mov x8, #4294967295; \ - adcs x15, x9, x8; \ - mov x8, #1; \ - adcs x16, x10, x8; \ - adcs x17, x11, xzr; \ - adcs x19, x12, xzr; \ - adcs x20, x13, xzr; \ - adcs x6, x6, xzr; \ - csel x2, x2, x14, eq; \ - csel x9, x9, x15, eq; \ - csel x10, x10, x16, eq; \ - csel x11, x11, x17, eq; \ - csel x12, x12, x19, eq; \ - csel x13, x13, x20, eq; \ - stp x2, x9, [P0]; \ - stp x10, x11, [P0+16]; \ - stp x12, x13, [P0+32] - -// Almost-Montgomery variant which we use when an input to other muls -// with the other argument fully reduced (which is always safe). In -// fact, with the Karatsuba-based Montgomery mul here, we don't even -// *need* the restriction that the other argument is reduced. 
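
[Editor's note, not part of the patch: the difference between the strict and the "almost" Montgomery squaring is only in how far the final result is reduced. Both return a value congruent to a * 2^-384 (mod p_384); the almost variant only guarantees the result fits in 384 bits, not that it lies below p_384. A generic Python sketch of the two reductions follows; the function names are illustrative and not the s2n-bignum API.]

    p384 = 2**384 - 2**128 - 2**96 + 2**32 - 1
    R = 2**384

    def amontred(a):
        # "Almost" Montgomery reduction: result < 2^384, congruent to a * R^{-1}
        # (mod p384), but possibly still >= p384.
        m = (a * pow(-p384, -1, R)) % R      # choose m so a + m*p384 is divisible by R
        t = (a + m * p384) // R
        if t >= R:                           # fold a possible top carry back below 2^384
            t -= p384
        return t

    def montred(a):
        # Strict version: additionally reduce below p384 itself.
        t = amontred(a)
        return t - p384 if t >= p384 else t

    a = (3**100 % R) * (5**80 % R)           # a product of two values below 2^384
    assert amontred(a) % p384 == (a * pow(R, -1, p384)) % p384
    assert montred(a) == (a * pow(R, -1, p384)) % p384

[A not-fully-reduced value is still a valid input to further Montgomery multiplications, which is why the cheaper variant can be used for intermediate squarings, as the comment above notes.]
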
- -#define amontsqr_p384(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x9, x2, x3; \ - umulh x10, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x8, x2, x4; \ - adds x10, x10, x8; \ - mul x11, x2, x5; \ - mul x8, x3, x4; \ - adcs x11, x11, x8; \ - umulh x12, x2, x5; \ - mul x8, x3, x5; \ - adcs x12, x12, x8; \ - ldp x6, x7, [P1+32]; \ - mul x13, x2, x7; \ - mul x8, x3, x6; \ - adcs x13, x13, x8; \ - umulh x14, x2, x7; \ - mul x8, x3, x7; \ - adcs x14, x14, x8; \ - mul x15, x5, x6; \ - adcs x15, x15, xzr; \ - umulh x16, x5, x6; \ - adc x16, x16, xzr; \ - umulh x8, x2, x4; \ - adds x11, x11, x8; \ - umulh x8, x3, x4; \ - adcs x12, x12, x8; \ - umulh x8, x3, x5; \ - adcs x13, x13, x8; \ - umulh x8, x3, x6; \ - adcs x14, x14, x8; \ - umulh x8, x3, x7; \ - adcs x15, x15, x8; \ - adc x16, x16, xzr; \ - mul x8, x2, x6; \ - adds x12, x12, x8; \ - mul x8, x4, x5; \ - adcs x13, x13, x8; \ - mul x8, x4, x6; \ - adcs x14, x14, x8; \ - mul x8, x4, x7; \ - adcs x15, x15, x8; \ - mul x8, x5, x7; \ - adcs x16, x16, x8; \ - mul x17, x6, x7; \ - adcs x17, x17, xzr; \ - umulh x19, x6, x7; \ - adc x19, x19, xzr; \ - umulh x8, x2, x6; \ - adds x13, x13, x8; \ - umulh x8, x4, x5; \ - adcs x14, x14, x8; \ - umulh x8, x4, x6; \ - adcs x15, x15, x8; \ - umulh x8, x4, x7; \ - adcs x16, x16, x8; \ - umulh x8, x5, x7; \ - adcs x17, x17, x8; \ - adc x19, x19, xzr; \ - adds x9, x9, x9; \ - adcs x10, x10, x10; \ - adcs x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - cset x20, hs; \ - umulh x8, x2, x2; \ - mul x2, x2, x2; \ - adds x9, x9, x8; \ - mul x8, x3, x3; \ - adcs x10, x10, x8; \ - umulh x8, x3, x3; \ - adcs x11, x11, x8; \ - mul x8, x4, x4; \ - adcs x12, x12, x8; \ - umulh x8, x4, x4; \ - adcs x13, x13, x8; \ - mul x8, x5, x5; \ - adcs x14, x14, x8; \ - umulh x8, x5, x5; \ - adcs x15, x15, x8; \ - mul x8, x6, x6; \ - adcs x16, x16, x8; \ - umulh x8, x6, x6; \ - adcs x17, x17, x8; \ - mul x8, x7, x7; \ - adcs x19, x19, x8; \ - umulh x8, x7, x7; \ - adc x20, x20, x8; \ - lsl x5, x2, #32; \ - add x2, x5, x2; \ - mov x5, #-4294967295; \ - umulh x5, x5, x2; \ - mov x4, #4294967295; \ - mul x3, x4, x2; \ - umulh x4, x4, x2; \ - adds x5, x5, x3; \ - adcs x4, x4, x2; \ - adc x3, xzr, xzr; \ - subs x9, x9, x5; \ - sbcs x10, x10, x4; \ - sbcs x11, x11, x3; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbc x2, x2, xzr; \ - lsl x5, x9, #32; \ - add x9, x5, x9; \ - mov x5, #-4294967295; \ - umulh x5, x5, x9; \ - mov x4, #4294967295; \ - mul x3, x4, x9; \ - umulh x4, x4, x9; \ - adds x5, x5, x3; \ - adcs x4, x4, x9; \ - adc x3, xzr, xzr; \ - subs x10, x10, x5; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ - sbcs x13, x13, xzr; \ - sbcs x2, x2, xzr; \ - sbc x9, x9, xzr; \ - lsl x5, x10, #32; \ - add x10, x5, x10; \ - mov x5, #-4294967295; \ - umulh x5, x5, x10; \ - mov x4, #4294967295; \ - mul x3, x4, x10; \ - umulh x4, x4, x10; \ - adds x5, x5, x3; \ - adcs x4, x4, x10; \ - adc x3, xzr, xzr; \ - subs x11, x11, x5; \ - sbcs x12, x12, x4; \ - sbcs x13, x13, x3; \ - sbcs x2, x2, xzr; \ - sbcs x9, x9, xzr; \ - sbc x10, x10, xzr; \ - lsl x5, x11, #32; \ - add x11, x5, x11; \ - mov x5, #-4294967295; \ - umulh x5, x5, x11; \ - mov x4, #4294967295; \ - mul x3, x4, x11; \ - umulh x4, x4, x11; \ - adds x5, x5, x3; \ - adcs x4, x4, x11; \ - adc x3, xzr, xzr; \ - subs x12, x12, x5; \ - sbcs x13, x13, x4; \ - sbcs x2, x2, x3; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbc x11, x11, xzr; \ - lsl x5, x12, 
#32; \ - add x12, x5, x12; \ - mov x5, #-4294967295; \ - umulh x5, x5, x12; \ - mov x4, #4294967295; \ - mul x3, x4, x12; \ - umulh x4, x4, x12; \ - adds x5, x5, x3; \ - adcs x4, x4, x12; \ - adc x3, xzr, xzr; \ - subs x13, x13, x5; \ - sbcs x2, x2, x4; \ - sbcs x9, x9, x3; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbc x12, x12, xzr; \ - lsl x5, x13, #32; \ - add x13, x5, x13; \ - mov x5, #-4294967295; \ - umulh x5, x5, x13; \ - mov x4, #4294967295; \ - mul x3, x4, x13; \ - umulh x4, x4, x13; \ - adds x5, x5, x3; \ - adcs x4, x4, x13; \ - adc x3, xzr, xzr; \ - subs x2, x2, x5; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, x17; \ - adcs x12, x12, x19; \ - adcs x13, x13, x20; \ - mov x14, #-4294967295; \ - mov x15, #4294967295; \ - csel x14, x14, xzr, cs; \ - csel x15, x15, xzr, cs; \ - cset x16, cs; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, xzr; \ - adcs x12, x12, xzr; \ - adc x13, x13, xzr; \ - stp x2, x9, [P0]; \ - stp x10, x11, [P0+16]; \ - stp x12, x13, [P0+32] - -// Corresponds exactly to bignum_sub_p384 - -#define sub_p384(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - csetm x3, lo; \ - mov x4, #4294967295; \ - and x4, x4, x3; \ - adds x5, x5, x4; \ - eor x4, x4, x3; \ - adcs x6, x6, x4; \ - mov x4, #-2; \ - and x4, x4, x3; \ - adcs x7, x7, x4; \ - adcs x8, x8, x3; \ - adcs x9, x9, x3; \ - adc x10, x10, x3; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32] - -S2N_BN_SYMBOL(p384_montjadd_alt): - -// Save regs and make room on stack for temporary variables - - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! - stp x23, x24, [sp, #-16]! - stp x25, x26, [sp, #-16]! 
- sub sp, sp, NSPACE - -// Move the input arguments to stable places - - mov input_z, x0 - mov input_x, x1 - mov input_y, x2 - -// Main code, just a sequence of basic field operations -// 8 * multiply + 3 * square + 7 * subtract - - amontsqr_p384(z1sq,z_1) - amontsqr_p384(z2sq,z_2) - - montmul_p384(y1a,z_2,y_1) - montmul_p384(y2a,z_1,y_2) - - montmul_p384(x2a,z1sq,x_2) - montmul_p384(x1a,z2sq,x_1) - montmul_p384(y2a,z1sq,y2a) - montmul_p384(y1a,z2sq,y1a) - - sub_p384(xd,x2a,x1a) - sub_p384(yd,y2a,y1a) - - amontsqr_p384(zz,xd) - montsqr_p384(ww,yd) - - montmul_p384(zzx1,zz,x1a) - montmul_p384(zzx2,zz,x2a) - - sub_p384(resx,ww,zzx1) - sub_p384(t1,zzx2,zzx1) - - montmul_p384(xd,xd,z_1) - - sub_p384(resx,resx,zzx2) - - sub_p384(t2,zzx1,resx) - - montmul_p384(t1,t1,y1a) - montmul_p384(resz,xd,z_2) - montmul_p384(t2,yd,t2) - - sub_p384(resy,t2,t1) - -// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 -// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) -// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) -// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 - - ldp x0, x1, [z_1] - ldp x2, x3, [z_1+16] - ldp x4, x5, [z_1+32] - - orr x20, x0, x1 - orr x21, x2, x3 - orr x22, x4, x5 - orr x20, x20, x21 - orr x20, x20, x22 - cmp x20, xzr - cset x20, ne - - ldp x6, x7, [z_2] - ldp x8, x9, [z_2+16] - ldp x10, x11, [z_2+32] - - orr x21, x6, x7 - orr x22, x8, x9 - orr x23, x10, x11 - orr x21, x21, x22 - orr x21, x21, x23 - cmp x21, xzr - cset x21, ne - - cmp x21, x20 - -// Multiplex the outputs accordingly, re-using the z's in registers - - ldp x12, x13, [resz] - csel x12, x0, x12, lo - csel x13, x1, x13, lo - csel x12, x6, x12, hi - csel x13, x7, x13, hi - ldp x14, x15, [resz+16] - csel x14, x2, x14, lo - csel x15, x3, x15, lo - csel x14, x8, x14, hi - csel x15, x9, x15, hi - ldp x16, x17, [resz+32] - csel x16, x4, x16, lo - csel x17, x5, x17, lo - csel x16, x10, x16, hi - csel x17, x11, x17, hi - - ldp x20, x21, [x_1] - ldp x0, x1, [resx] - csel x0, x20, x0, lo - csel x1, x21, x1, lo - ldp x20, x21, [x_2] - csel x0, x20, x0, hi - csel x1, x21, x1, hi - - ldp x20, x21, [x_1+16] - ldp x2, x3, [resx+16] - csel x2, x20, x2, lo - csel x3, x21, x3, lo - ldp x20, x21, [x_2+16] - csel x2, x20, x2, hi - csel x3, x21, x3, hi - - ldp x20, x21, [x_1+32] - ldp x4, x5, [resx+32] - csel x4, x20, x4, lo - csel x5, x21, x5, lo - ldp x20, x21, [x_2+32] - csel x4, x20, x4, hi - csel x5, x21, x5, hi - - ldp x20, x21, [y_1] - ldp x6, x7, [resy] - csel x6, x20, x6, lo - csel x7, x21, x7, lo - ldp x20, x21, [y_2] - csel x6, x20, x6, hi - csel x7, x21, x7, hi - - ldp x20, x21, [y_1+16] - ldp x8, x9, [resy+16] - csel x8, x20, x8, lo - csel x9, x21, x9, lo - ldp x20, x21, [y_2+16] - csel x8, x20, x8, hi - csel x9, x21, x9, hi - - ldp x20, x21, [y_1+32] - ldp x10, x11, [resy+32] - csel x10, x20, x10, lo - csel x11, x21, x11, lo - ldp x20, x21, [y_2+32] - csel x10, x20, x10, hi - csel x11, x21, x11, hi - -// Finally store back the multiplexed values - - stp x0, x1, [x_3] - stp x2, x3, [x_3+16] - stp x4, x5, [x_3+32] - stp x6, x7, [y_3] - stp x8, x9, [y_3+16] - stp x10, x11, [y_3+32] - stp x12, x13, [z_3] - stp x14, x15, [z_3+16] - stp x16, x17, [z_3+32] - -// Restore stack and registers - - add sp, sp, NSPACE - - ldp x25, x26, [sp], 16 - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits -#endif diff --git a/third_party/s2n-bignum/arm/p384/p384_montjdouble_alt.S 
b/third_party/s2n-bignum/arm/p384/p384_montjdouble_alt.S deleted file mode 100644 index 0e83ff4a986..00000000000 --- a/third_party/s2n-bignum/arm/p384/p384_montjdouble_alt.S +++ /dev/null @@ -1,951 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Point doubling on NIST curve P-384 in Montgomery-Jacobian coordinates -// -// extern void p384_montjdouble_alt -// (uint64_t p3[static 18],uint64_t p1[static 18]); -// -// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with -// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. -// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). -// -// Standard ARM ABI: X0 = p3, X1 = p1 -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjdouble_alt) - S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjdouble_alt) - .text - .balign 4 - -// Size of individual field elements - -#define NUMSIZE 48 - -// Stable homes for input arguments during main code sequence - -#define input_z x23 -#define input_x x24 - -// Pointer-offset pairs for inputs and outputs - -#define x_1 input_x, #0 -#define y_1 input_x, #NUMSIZE -#define z_1 input_x, #(2*NUMSIZE) - -#define x_3 input_z, #0 -#define y_3 input_z, #NUMSIZE -#define z_3 input_z, #(2*NUMSIZE) - -// Pointer-offset pairs for temporaries, with some aliasing -// NSPACE is the total stack needed for these temporaries - -#define z2 sp, #(NUMSIZE*0) -#define y2 sp, #(NUMSIZE*1) -#define x2p sp, #(NUMSIZE*2) -#define xy2 sp, #(NUMSIZE*3) - -#define y4 sp, #(NUMSIZE*4) -#define t2 sp, #(NUMSIZE*4) - -#define dx2 sp, #(NUMSIZE*5) -#define t1 sp, #(NUMSIZE*5) - -#define d sp, #(NUMSIZE*6) -#define x4p sp, #(NUMSIZE*6) - -#define NSPACE (NUMSIZE*7) - -// Corresponds exactly to bignum_montmul_p384_alt - -#define montmul_p384(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x12, x3, x5; \ - umulh x13, x3, x5; \ - mul x11, x3, x6; \ - umulh x14, x3, x6; \ - adds x13, x13, x11; \ - ldp x7, x8, [P2+16]; \ - mul x11, x3, x7; \ - umulh x15, x3, x7; \ - adcs x14, x14, x11; \ - mul x11, x3, x8; \ - umulh x16, x3, x8; \ - adcs x15, x15, x11; \ - ldp x9, x10, [P2+32]; \ - mul x11, x3, x9; \ - umulh x17, x3, x9; \ - adcs x16, x16, x11; \ - mul x11, x3, x10; \ - umulh x19, x3, x10; \ - adcs x17, x17, x11; \ - adc x19, x19, xzr; \ - mul x11, x4, x5; \ - adds x13, x13, x11; \ - mul x11, x4, x6; \ - adcs x14, x14, x11; \ - mul x11, x4, x7; \ - adcs x15, x15, x11; \ - mul x11, x4, x8; \ - adcs x16, x16, x11; \ - mul x11, x4, x9; \ - adcs x17, x17, x11; \ - mul x11, x4, x10; \ - adcs x19, x19, x11; \ - cset x20, cs; \ - umulh x11, x4, x5; \ - adds x14, x14, x11; \ - umulh x11, x4, x6; \ - adcs x15, x15, x11; \ - umulh x11, x4, x7; \ - adcs x16, x16, x11; \ - umulh x11, x4, x8; \ - adcs x17, x17, x11; \ - umulh x11, x4, x9; \ - adcs x19, x19, x11; \ - umulh x11, x4, x10; \ - adc x20, x20, x11; \ - ldp x3, x4, [P1+16]; \ - mul x11, x3, x5; \ - adds x14, x14, x11; \ - mul x11, x3, x6; \ - adcs x15, x15, x11; \ - mul x11, x3, x7; \ - adcs x16, x16, x11; \ - mul x11, x3, x8; \ - adcs x17, x17, x11; \ - mul x11, x3, x9; \ - adcs x19, x19, x11; \ - mul x11, x3, x10; \ - adcs x20, x20, x11; \ - cset x21, cs; \ - umulh x11, x3, x5; \ - adds x15, x15, x11; \ - umulh x11, x3, x6; \ - adcs x16, x16, x11; \ - umulh x11, x3, 
x7; \ - adcs x17, x17, x11; \ - umulh x11, x3, x8; \ - adcs x19, x19, x11; \ - umulh x11, x3, x9; \ - adcs x20, x20, x11; \ - umulh x11, x3, x10; \ - adc x21, x21, x11; \ - mul x11, x4, x5; \ - adds x15, x15, x11; \ - mul x11, x4, x6; \ - adcs x16, x16, x11; \ - mul x11, x4, x7; \ - adcs x17, x17, x11; \ - mul x11, x4, x8; \ - adcs x19, x19, x11; \ - mul x11, x4, x9; \ - adcs x20, x20, x11; \ - mul x11, x4, x10; \ - adcs x21, x21, x11; \ - cset x22, cs; \ - umulh x11, x4, x5; \ - adds x16, x16, x11; \ - umulh x11, x4, x6; \ - adcs x17, x17, x11; \ - umulh x11, x4, x7; \ - adcs x19, x19, x11; \ - umulh x11, x4, x8; \ - adcs x20, x20, x11; \ - umulh x11, x4, x9; \ - adcs x21, x21, x11; \ - umulh x11, x4, x10; \ - adc x22, x22, x11; \ - ldp x3, x4, [P1+32]; \ - mul x11, x3, x5; \ - adds x16, x16, x11; \ - mul x11, x3, x6; \ - adcs x17, x17, x11; \ - mul x11, x3, x7; \ - adcs x19, x19, x11; \ - mul x11, x3, x8; \ - adcs x20, x20, x11; \ - mul x11, x3, x9; \ - adcs x21, x21, x11; \ - mul x11, x3, x10; \ - adcs x22, x22, x11; \ - cset x2, cs; \ - umulh x11, x3, x5; \ - adds x17, x17, x11; \ - umulh x11, x3, x6; \ - adcs x19, x19, x11; \ - umulh x11, x3, x7; \ - adcs x20, x20, x11; \ - umulh x11, x3, x8; \ - adcs x21, x21, x11; \ - umulh x11, x3, x9; \ - adcs x22, x22, x11; \ - umulh x11, x3, x10; \ - adc x2, x2, x11; \ - mul x11, x4, x5; \ - adds x17, x17, x11; \ - mul x11, x4, x6; \ - adcs x19, x19, x11; \ - mul x11, x4, x7; \ - adcs x20, x20, x11; \ - mul x11, x4, x8; \ - adcs x21, x21, x11; \ - mul x11, x4, x9; \ - adcs x22, x22, x11; \ - mul x11, x4, x10; \ - adcs x2, x2, x11; \ - cset x1, cs; \ - umulh x11, x4, x5; \ - adds x19, x19, x11; \ - umulh x11, x4, x6; \ - adcs x20, x20, x11; \ - umulh x11, x4, x7; \ - adcs x21, x21, x11; \ - umulh x11, x4, x8; \ - adcs x22, x22, x11; \ - umulh x11, x4, x9; \ - adcs x2, x2, x11; \ - umulh x11, x4, x10; \ - adc x1, x1, x11; \ - lsl x7, x12, #32; \ - add x12, x7, x12; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x12; \ - mov x6, #0xffffffff; \ - mul x5, x6, x12; \ - umulh x6, x6, x12; \ - adds x7, x7, x5; \ - adcs x6, x6, x12; \ - adc x5, xzr, xzr; \ - subs x13, x13, x7; \ - sbcs x14, x14, x6; \ - sbcs x15, x15, x5; \ - sbcs x16, x16, xzr; \ - sbcs x17, x17, xzr; \ - sbc x12, x12, xzr; \ - lsl x7, x13, #32; \ - add x13, x7, x13; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x13; \ - mov x6, #0xffffffff; \ - mul x5, x6, x13; \ - umulh x6, x6, x13; \ - adds x7, x7, x5; \ - adcs x6, x6, x13; \ - adc x5, xzr, xzr; \ - subs x14, x14, x7; \ - sbcs x15, x15, x6; \ - sbcs x16, x16, x5; \ - sbcs x17, x17, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - lsl x7, x14, #32; \ - add x14, x7, x14; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x14; \ - mov x6, #0xffffffff; \ - mul x5, x6, x14; \ - umulh x6, x6, x14; \ - adds x7, x7, x5; \ - adcs x6, x6, x14; \ - adc x5, xzr, xzr; \ - subs x15, x15, x7; \ - sbcs x16, x16, x6; \ - sbcs x17, x17, x5; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbc x14, x14, xzr; \ - lsl x7, x15, #32; \ - add x15, x7, x15; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x15; \ - mov x6, #0xffffffff; \ - mul x5, x6, x15; \ - umulh x6, x6, x15; \ - adds x7, x7, x5; \ - adcs x6, x6, x15; \ - adc x5, xzr, xzr; \ - subs x16, x16, x7; \ - sbcs x17, x17, x6; \ - sbcs x12, x12, x5; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbc x15, x15, xzr; \ - lsl x7, x16, #32; \ - add x16, x7, x16; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x16; \ - mov x6, #0xffffffff; \ - mul x5, x6, x16; \ - umulh x6, 
x6, x16; \ - adds x7, x7, x5; \ - adcs x6, x6, x16; \ - adc x5, xzr, xzr; \ - subs x17, x17, x7; \ - sbcs x12, x12, x6; \ - sbcs x13, x13, x5; \ - sbcs x14, x14, xzr; \ - sbcs x15, x15, xzr; \ - sbc x16, x16, xzr; \ - lsl x7, x17, #32; \ - add x17, x7, x17; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x17; \ - mov x6, #0xffffffff; \ - mul x5, x6, x17; \ - umulh x6, x6, x17; \ - adds x7, x7, x5; \ - adcs x6, x6, x17; \ - adc x5, xzr, xzr; \ - subs x12, x12, x7; \ - sbcs x13, x13, x6; \ - sbcs x14, x14, x5; \ - sbcs x15, x15, xzr; \ - sbcs x16, x16, xzr; \ - sbc x17, x17, xzr; \ - adds x12, x12, x19; \ - adcs x13, x13, x20; \ - adcs x14, x14, x21; \ - adcs x15, x15, x22; \ - adcs x16, x16, x2; \ - adcs x17, x17, x1; \ - adc x10, xzr, xzr; \ - mov x11, #0xffffffff00000001; \ - adds x19, x12, x11; \ - mov x11, #0xffffffff; \ - adcs x20, x13, x11; \ - mov x11, #0x1; \ - adcs x21, x14, x11; \ - adcs x22, x15, xzr; \ - adcs x2, x16, xzr; \ - adcs x1, x17, xzr; \ - adcs x10, x10, xzr; \ - csel x12, x12, x19, eq; \ - csel x13, x13, x20, eq; \ - csel x14, x14, x21, eq; \ - csel x15, x15, x22, eq; \ - csel x16, x16, x2, eq; \ - csel x17, x17, x1, eq; \ - stp x12, x13, [P0]; \ - stp x14, x15, [P0+16]; \ - stp x16, x17, [P0+32] - -// Corresponds exactly to bignum_montsqr_p384_alt - -#define montsqr_p384(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x9, x2, x3; \ - umulh x10, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x8, x2, x4; \ - adds x10, x10, x8; \ - mul x11, x2, x5; \ - mul x8, x3, x4; \ - adcs x11, x11, x8; \ - umulh x12, x2, x5; \ - mul x8, x3, x5; \ - adcs x12, x12, x8; \ - ldp x6, x7, [P1+32]; \ - mul x13, x2, x7; \ - mul x8, x3, x6; \ - adcs x13, x13, x8; \ - umulh x14, x2, x7; \ - mul x8, x3, x7; \ - adcs x14, x14, x8; \ - mul x15, x5, x6; \ - adcs x15, x15, xzr; \ - umulh x16, x5, x6; \ - adc x16, x16, xzr; \ - umulh x8, x2, x4; \ - adds x11, x11, x8; \ - umulh x8, x3, x4; \ - adcs x12, x12, x8; \ - umulh x8, x3, x5; \ - adcs x13, x13, x8; \ - umulh x8, x3, x6; \ - adcs x14, x14, x8; \ - umulh x8, x3, x7; \ - adcs x15, x15, x8; \ - adc x16, x16, xzr; \ - mul x8, x2, x6; \ - adds x12, x12, x8; \ - mul x8, x4, x5; \ - adcs x13, x13, x8; \ - mul x8, x4, x6; \ - adcs x14, x14, x8; \ - mul x8, x4, x7; \ - adcs x15, x15, x8; \ - mul x8, x5, x7; \ - adcs x16, x16, x8; \ - mul x17, x6, x7; \ - adcs x17, x17, xzr; \ - umulh x19, x6, x7; \ - adc x19, x19, xzr; \ - umulh x8, x2, x6; \ - adds x13, x13, x8; \ - umulh x8, x4, x5; \ - adcs x14, x14, x8; \ - umulh x8, x4, x6; \ - adcs x15, x15, x8; \ - umulh x8, x4, x7; \ - adcs x16, x16, x8; \ - umulh x8, x5, x7; \ - adcs x17, x17, x8; \ - adc x19, x19, xzr; \ - adds x9, x9, x9; \ - adcs x10, x10, x10; \ - adcs x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - cset x20, hs; \ - umulh x8, x2, x2; \ - mul x2, x2, x2; \ - adds x9, x9, x8; \ - mul x8, x3, x3; \ - adcs x10, x10, x8; \ - umulh x8, x3, x3; \ - adcs x11, x11, x8; \ - mul x8, x4, x4; \ - adcs x12, x12, x8; \ - umulh x8, x4, x4; \ - adcs x13, x13, x8; \ - mul x8, x5, x5; \ - adcs x14, x14, x8; \ - umulh x8, x5, x5; \ - adcs x15, x15, x8; \ - mul x8, x6, x6; \ - adcs x16, x16, x8; \ - umulh x8, x6, x6; \ - adcs x17, x17, x8; \ - mul x8, x7, x7; \ - adcs x19, x19, x8; \ - umulh x8, x7, x7; \ - adc x20, x20, x8; \ - lsl x5, x2, #32; \ - add x2, x5, x2; \ - mov x5, #-4294967295; \ - umulh x5, x5, x2; \ - mov x4, #4294967295; \ - mul x3, x4, x2; \ - umulh x4, x4, x2; \ - 
adds x5, x5, x3; \ - adcs x4, x4, x2; \ - adc x3, xzr, xzr; \ - subs x9, x9, x5; \ - sbcs x10, x10, x4; \ - sbcs x11, x11, x3; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbc x2, x2, xzr; \ - lsl x5, x9, #32; \ - add x9, x5, x9; \ - mov x5, #-4294967295; \ - umulh x5, x5, x9; \ - mov x4, #4294967295; \ - mul x3, x4, x9; \ - umulh x4, x4, x9; \ - adds x5, x5, x3; \ - adcs x4, x4, x9; \ - adc x3, xzr, xzr; \ - subs x10, x10, x5; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ - sbcs x13, x13, xzr; \ - sbcs x2, x2, xzr; \ - sbc x9, x9, xzr; \ - lsl x5, x10, #32; \ - add x10, x5, x10; \ - mov x5, #-4294967295; \ - umulh x5, x5, x10; \ - mov x4, #4294967295; \ - mul x3, x4, x10; \ - umulh x4, x4, x10; \ - adds x5, x5, x3; \ - adcs x4, x4, x10; \ - adc x3, xzr, xzr; \ - subs x11, x11, x5; \ - sbcs x12, x12, x4; \ - sbcs x13, x13, x3; \ - sbcs x2, x2, xzr; \ - sbcs x9, x9, xzr; \ - sbc x10, x10, xzr; \ - lsl x5, x11, #32; \ - add x11, x5, x11; \ - mov x5, #-4294967295; \ - umulh x5, x5, x11; \ - mov x4, #4294967295; \ - mul x3, x4, x11; \ - umulh x4, x4, x11; \ - adds x5, x5, x3; \ - adcs x4, x4, x11; \ - adc x3, xzr, xzr; \ - subs x12, x12, x5; \ - sbcs x13, x13, x4; \ - sbcs x2, x2, x3; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbc x11, x11, xzr; \ - lsl x5, x12, #32; \ - add x12, x5, x12; \ - mov x5, #-4294967295; \ - umulh x5, x5, x12; \ - mov x4, #4294967295; \ - mul x3, x4, x12; \ - umulh x4, x4, x12; \ - adds x5, x5, x3; \ - adcs x4, x4, x12; \ - adc x3, xzr, xzr; \ - subs x13, x13, x5; \ - sbcs x2, x2, x4; \ - sbcs x9, x9, x3; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbc x12, x12, xzr; \ - lsl x5, x13, #32; \ - add x13, x5, x13; \ - mov x5, #-4294967295; \ - umulh x5, x5, x13; \ - mov x4, #4294967295; \ - mul x3, x4, x13; \ - umulh x4, x4, x13; \ - adds x5, x5, x3; \ - adcs x4, x4, x13; \ - adc x3, xzr, xzr; \ - subs x2, x2, x5; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, x17; \ - adcs x12, x12, x19; \ - adcs x13, x13, x20; \ - adc x6, xzr, xzr; \ - mov x8, #-4294967295; \ - adds x14, x2, x8; \ - mov x8, #4294967295; \ - adcs x15, x9, x8; \ - mov x8, #1; \ - adcs x16, x10, x8; \ - adcs x17, x11, xzr; \ - adcs x19, x12, xzr; \ - adcs x20, x13, xzr; \ - adcs x6, x6, xzr; \ - csel x2, x2, x14, eq; \ - csel x9, x9, x15, eq; \ - csel x10, x10, x16, eq; \ - csel x11, x11, x17, eq; \ - csel x12, x12, x19, eq; \ - csel x13, x13, x20, eq; \ - stp x2, x9, [P0]; \ - stp x10, x11, [P0+16]; \ - stp x12, x13, [P0+32] - -// Corresponds exactly to bignum_sub_p384 - -#define sub_p384(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - csetm x3, lo; \ - mov x4, #4294967295; \ - and x4, x4, x3; \ - adds x5, x5, x4; \ - eor x4, x4, x3; \ - adcs x6, x6, x4; \ - mov x4, #-2; \ - and x4, x4, x3; \ - adcs x7, x7, x4; \ - adcs x8, x8, x3; \ - adcs x9, x9, x3; \ - adc x10, x10, x3; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32] - -// Corresponds exactly to bignum_add_p384 - -#define add_p384(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - adds x5, x5, x4; \ - adcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - adcs x7, x7, x4; \ - adcs x8, x8, x3; \ 
- ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - adcs x9, x9, x4; \ - adcs x10, x10, x3; \ - adc x3, xzr, xzr; \ - mov x4, #0xffffffff; \ - cmp x5, x4; \ - mov x4, #0xffffffff00000000; \ - sbcs xzr, x6, x4; \ - mov x4, #0xfffffffffffffffe; \ - sbcs xzr, x7, x4; \ - adcs xzr, x8, xzr; \ - adcs xzr, x9, xzr; \ - adcs xzr, x10, xzr; \ - adcs x3, x3, xzr; \ - csetm x3, ne; \ - mov x4, #0xffffffff; \ - and x4, x4, x3; \ - subs x5, x5, x4; \ - eor x4, x4, x3; \ - sbcs x6, x6, x4; \ - mov x4, #0xfffffffffffffffe; \ - and x4, x4, x3; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - sbcs x9, x9, x3; \ - sbc x10, x10, x3; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32] - -// P0 = 4 * P1 - P2 - -#define cmsub41_p384(P0,P1,P2) \ - ldp x1, x2, [P1]; \ - ldp x3, x4, [P1+16]; \ - ldp x5, x6, [P1+32]; \ - lsl x0, x1, #2; \ - ldp x7, x8, [P2]; \ - subs x0, x0, x7; \ - extr x1, x2, x1, #62; \ - sbcs x1, x1, x8; \ - ldp x7, x8, [P2+16]; \ - extr x2, x3, x2, #62; \ - sbcs x2, x2, x7; \ - extr x3, x4, x3, #62; \ - sbcs x3, x3, x8; \ - extr x4, x5, x4, #62; \ - ldp x7, x8, [P2+32]; \ - sbcs x4, x4, x7; \ - extr x5, x6, x5, #62; \ - sbcs x5, x5, x8; \ - lsr x6, x6, #62; \ - adc x6, x6, xzr; \ - lsl x7, x6, #32; \ - subs x8, x6, x7; \ - sbc x7, x7, xzr; \ - adds x0, x0, x8; \ - adcs x1, x1, x7; \ - adcs x2, x2, x6; \ - adcs x3, x3, xzr; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - csetm x8, cc; \ - mov x9, #0xffffffff; \ - and x9, x9, x8; \ - adds x0, x0, x9; \ - eor x9, x9, x8; \ - adcs x1, x1, x9; \ - mov x9, #0xfffffffffffffffe; \ - and x9, x9, x8; \ - adcs x2, x2, x9; \ - adcs x3, x3, x8; \ - adcs x4, x4, x8; \ - adc x5, x5, x8; \ - stp x0, x1, [P0]; \ - stp x2, x3, [P0+16]; \ - stp x4, x5, [P0+32] - -// P0 = C * P1 - D * P2 - -#define cmsub_p384(P0,C,P1,D,P2) \ - ldp x0, x1, [P2]; \ - mov x6, #0x00000000ffffffff; \ - subs x6, x6, x0; \ - mov x7, #0xffffffff00000000; \ - sbcs x7, x7, x1; \ - ldp x0, x1, [P2+16]; \ - mov x8, #0xfffffffffffffffe; \ - sbcs x8, x8, x0; \ - mov x13, #0xffffffffffffffff; \ - sbcs x9, x13, x1; \ - ldp x0, x1, [P2+32]; \ - sbcs x10, x13, x0; \ - sbc x11, x13, x1; \ - mov x12, D; \ - mul x0, x12, x6; \ - mul x1, x12, x7; \ - mul x2, x12, x8; \ - mul x3, x12, x9; \ - mul x4, x12, x10; \ - mul x5, x12, x11; \ - umulh x6, x12, x6; \ - umulh x7, x12, x7; \ - umulh x8, x12, x8; \ - umulh x9, x12, x9; \ - umulh x10, x12, x10; \ - umulh x12, x12, x11; \ - adds x1, x1, x6; \ - adcs x2, x2, x7; \ - adcs x3, x3, x8; \ - adcs x4, x4, x9; \ - adcs x5, x5, x10; \ - mov x6, #1; \ - adc x6, x12, x6; \ - ldp x8, x9, [P1]; \ - ldp x10, x11, [P1+16]; \ - ldp x12, x13, [P1+32]; \ - mov x14, C; \ - mul x15, x14, x8; \ - umulh x8, x14, x8; \ - adds x0, x0, x15; \ - mul x15, x14, x9; \ - umulh x9, x14, x9; \ - adcs x1, x1, x15; \ - mul x15, x14, x10; \ - umulh x10, x14, x10; \ - adcs x2, x2, x15; \ - mul x15, x14, x11; \ - umulh x11, x14, x11; \ - adcs x3, x3, x15; \ - mul x15, x14, x12; \ - umulh x12, x14, x12; \ - adcs x4, x4, x15; \ - mul x15, x14, x13; \ - umulh x13, x14, x13; \ - adcs x5, x5, x15; \ - adc x6, x6, xzr; \ - adds x1, x1, x8; \ - adcs x2, x2, x9; \ - adcs x3, x3, x10; \ - adcs x4, x4, x11; \ - adcs x5, x5, x12; \ - adcs x6, x6, x13; \ - lsl x7, x6, #32; \ - subs x8, x6, x7; \ - sbc x7, x7, xzr; \ - adds x0, x0, x8; \ - adcs x1, x1, x7; \ - adcs x2, x2, x6; \ - adcs x3, x3, xzr; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - csetm x6, cc; \ - mov x7, #0xffffffff; \ - and x7, x7, x6; \ - adds x0, x0, x7; \ - eor x7, x7, x6; \ - adcs x1, x1, x7; \ - mov x7, 
#0xfffffffffffffffe; \ - and x7, x7, x6; \ - adcs x2, x2, x7; \ - adcs x3, x3, x6; \ - adcs x4, x4, x6; \ - adc x5, x5, x6; \ - stp x0, x1, [P0]; \ - stp x2, x3, [P0+16]; \ - stp x4, x5, [P0+32] - -// A weak version of add that only guarantees sum in 6 digits - -#define weakadd_p384(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - adds x5, x5, x4; \ - adcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - adcs x7, x7, x4; \ - adcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - adcs x9, x9, x4; \ - adcs x10, x10, x3; \ - csetm x3, cs; \ - mov x4, #0xffffffff; \ - and x4, x4, x3; \ - subs x5, x5, x4; \ - eor x4, x4, x3; \ - sbcs x6, x6, x4; \ - mov x4, #0xfffffffffffffffe; \ - and x4, x4, x3; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - sbcs x9, x9, x3; \ - sbc x10, x10, x3; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32] - -// P0 = 3 * P1 - 8 * P2 - -#define cmsub38_p384(P0,P1,P2) \ - ldp x0, x1, [P2]; \ - mov x6, #0x00000000ffffffff; \ - subs x6, x6, x0; \ - mov x7, #0xffffffff00000000; \ - sbcs x7, x7, x1; \ - ldp x0, x1, [P2+16]; \ - mov x8, #0xfffffffffffffffe; \ - sbcs x8, x8, x0; \ - mov x13, #0xffffffffffffffff; \ - sbcs x9, x13, x1; \ - ldp x0, x1, [P2+32]; \ - sbcs x10, x13, x0; \ - sbc x11, x13, x1; \ - lsl x0, x6, #3; \ - extr x1, x7, x6, #61; \ - extr x2, x8, x7, #61; \ - extr x3, x9, x8, #61; \ - extr x4, x10, x9, #61; \ - extr x5, x11, x10, #61; \ - lsr x6, x11, #61; \ - add x6, x6, #1; \ - ldp x8, x9, [P1]; \ - ldp x10, x11, [P1+16]; \ - ldp x12, x13, [P1+32]; \ - mov x14, 3; \ - mul x15, x14, x8; \ - umulh x8, x14, x8; \ - adds x0, x0, x15; \ - mul x15, x14, x9; \ - umulh x9, x14, x9; \ - adcs x1, x1, x15; \ - mul x15, x14, x10; \ - umulh x10, x14, x10; \ - adcs x2, x2, x15; \ - mul x15, x14, x11; \ - umulh x11, x14, x11; \ - adcs x3, x3, x15; \ - mul x15, x14, x12; \ - umulh x12, x14, x12; \ - adcs x4, x4, x15; \ - mul x15, x14, x13; \ - umulh x13, x14, x13; \ - adcs x5, x5, x15; \ - adc x6, x6, xzr; \ - adds x1, x1, x8; \ - adcs x2, x2, x9; \ - adcs x3, x3, x10; \ - adcs x4, x4, x11; \ - adcs x5, x5, x12; \ - adcs x6, x6, x13; \ - lsl x7, x6, #32; \ - subs x8, x6, x7; \ - sbc x7, x7, xzr; \ - adds x0, x0, x8; \ - adcs x1, x1, x7; \ - adcs x2, x2, x6; \ - adcs x3, x3, xzr; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - csetm x6, cc; \ - mov x7, #0xffffffff; \ - and x7, x7, x6; \ - adds x0, x0, x7; \ - eor x7, x7, x6; \ - adcs x1, x1, x7; \ - mov x7, #0xfffffffffffffffe; \ - and x7, x7, x6; \ - adcs x2, x2, x7; \ - adcs x3, x3, x6; \ - adcs x4, x4, x6; \ - adc x5, x5, x6; \ - stp x0, x1, [P0]; \ - stp x2, x3, [P0+16]; \ - stp x4, x5, [P0+32] - -S2N_BN_SYMBOL(p384_montjdouble_alt): - -// Save regs and make room on stack for temporary variables - - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! - stp x23, x24, [sp, #-16]! 
- sub sp, sp, NSPACE - -// Move the input arguments to stable places - - mov input_z, x0 - mov input_x, x1 - -// Main code, just a sequence of basic field operations - -// z2 = z^2 -// y2 = y^2 - - montsqr_p384(z2,z_1) - montsqr_p384(y2,y_1) - -// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) - - weakadd_p384(t1,x_1,z2) - sub_p384(t2,x_1,z2) - montmul_p384(x2p,t1,t2) - -// t1 = y + z -// x4p = x2p^2 -// xy2 = x * y^2 - - add_p384(t1,y_1,z_1) - montsqr_p384(x4p,x2p) - montmul_p384(xy2,x_1,y2) - -// t2 = (y + z)^2 - - montsqr_p384(t2,t1) - -// d = 12 * xy2 - 9 * x4p -// t1 = y^2 + 2 * y * z - - cmsub_p384(d,12,xy2,9,x4p) - sub_p384(t1,t2,z2) - -// y4 = y^4 - - montsqr_p384(y4,y2) - -// z_3' = 2 * y * z -// dx2 = d * x2p - - sub_p384(z_3,t1,y2) - montmul_p384(dx2,d,x2p) - -// x' = 4 * xy2 - d - - cmsub41_p384(x_3,xy2,d) - -// y' = 3 * dx2 - 8 * y4 - - cmsub38_p384(y_3,dx2,y4) - -// Restore stack and registers - - add sp, sp, NSPACE - - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits -#endif diff --git a/third_party/s2n-bignum/arm/p384/p384_montjmixadd.S b/third_party/s2n-bignum/arm/p384/p384_montjmixadd.S deleted file mode 100644 index f340e4f5ce6..00000000000 --- a/third_party/s2n-bignum/arm/p384/p384_montjmixadd.S +++ /dev/null @@ -1,876 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates -// -// extern void p384_montjmixadd -// (uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 12]); -// -// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with -// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. -// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). -// The "mixed" part means that p2 only has x and y coordinates, with the -// implicit z coordinate assumed to be the identity. 
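Read as straight-line field arithmetic, the p384_montjdouble_alt sequence above corresponds to the following C-level sketch. The fe type and the fe_* helper names are hypothetical stand-ins for the assembly macros (montsqr_p384, montmul_p384, add_p384, weakadd_p384, sub_p384, cmsub_p384, cmsub41_p384, cmsub38_p384); this is an illustration of the doubling formula, not code from the tree:

    #include <stdint.h>

    typedef uint64_t fe[6];   /* one P-384 field element, 6 x 64-bit limbs */

    /* Hypothetical prototypes mirroring the assembly macros. */
    void fe_montsqr(fe r, const fe a);                 /* montsqr_p384  */
    void fe_montmul(fe r, const fe a, const fe b);     /* montmul_p384  */
    void fe_add(fe r, const fe a, const fe b);         /* add_p384      */
    void fe_weakadd(fe r, const fe a, const fe b);     /* weakadd_p384  */
    void fe_sub(fe r, const fe a, const fe b);         /* sub_p384      */
    void fe_cmsub(fe r, uint64_t c, const fe a, uint64_t d, const fe b); /* c*a - d*b */
    void fe_cmsub41(fe r, const fe a, const fe b);     /* 4*a - b       */
    void fe_cmsub38(fe r, const fe a, const fe b);     /* 3*a - 8*b     */

    /* Jacobian doubling (x,y,z) -> (x3,y3,z3), coordinates Montgomery-encoded. */
    void p384_jdouble_sketch(fe x3, fe y3, fe z3,
                             const fe x, const fe y, const fe z) {
      fe z2, y2, t1, t2, x2p, x4p, xy2, y4, d, dx2;
      fe_montsqr(z2, z);                 /* z2  = z^2                         */
      fe_montsqr(y2, y);                 /* y2  = y^2                         */
      fe_weakadd(t1, x, z2);             /* t1  = x + z^2 (6 digits, not reduced) */
      fe_sub(t2, x, z2);                 /* t2  = x - z^2                     */
      fe_montmul(x2p, t1, t2);           /* x2p = x^2 - z^4                   */
      fe_add(t1, y, z);                  /* t1  = y + z                       */
      fe_montsqr(x4p, x2p);              /* x4p = x2p^2                       */
      fe_montmul(xy2, x, y2);            /* xy2 = x * y^2                     */
      fe_montsqr(t2, t1);                /* t2  = (y + z)^2                   */
      fe_cmsub(d, 12, xy2, 9, x4p);      /* d   = 12*xy2 - 9*x4p              */
      fe_sub(t1, t2, z2);                /* t1  = y^2 + 2*y*z                 */
      fe_montsqr(y4, y2);                /* y4  = y^4                         */
      fe_sub(z3, t1, y2);                /* z3  = 2*y*z                       */
      fe_montmul(dx2, d, x2p);           /* dx2 = d * x2p                     */
      fe_cmsub41(x3, xy2, d);            /* x3  = 4*xy2 - d                   */
      fe_cmsub38(y3, dx2, y4);           /* y3  = 3*dx2 - 8*y4                */
    }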
-// -// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjmixadd) - S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjmixadd) - .text - .balign 4 - -// Size of individual field elements - -#define NUMSIZE 48 - -// Stable homes for input arguments during main code sequence - -#define input_z x24 -#define input_x x25 -#define input_y x26 - -// Pointer-offset pairs for inputs and outputs - -#define x_1 input_x, #0 -#define y_1 input_x, #NUMSIZE -#define z_1 input_x, #(2*NUMSIZE) - -#define x_2 input_y, #0 -#define y_2 input_y, #NUMSIZE - -#define x_3 input_z, #0 -#define y_3 input_z, #NUMSIZE -#define z_3 input_z, #(2*NUMSIZE) - -// Pointer-offset pairs for temporaries, with some aliasing -// NSPACE is the total stack needed for these temporaries - -#define zp2 sp, #(NUMSIZE*0) -#define ww sp, #(NUMSIZE*0) -#define resx sp, #(NUMSIZE*0) - -#define yd sp, #(NUMSIZE*1) -#define y2a sp, #(NUMSIZE*1) - -#define x2a sp, #(NUMSIZE*2) -#define zzx2 sp, #(NUMSIZE*2) - -#define zz sp, #(NUMSIZE*3) -#define t1 sp, #(NUMSIZE*3) - -#define t2 sp, #(NUMSIZE*4) -#define zzx1 sp, #(NUMSIZE*4) -#define resy sp, #(NUMSIZE*4) - -#define xd sp, #(NUMSIZE*5) -#define resz sp, #(NUMSIZE*5) - -#define NSPACE (NUMSIZE*6) - -// Corresponds to bignum_montmul_p384 except x24 -> x0 - -#define montmul_p384(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P1+32]; \ - ldp x9, x10, [P2]; \ - ldp x11, x12, [P2+16]; \ - ldp x13, x14, [P2+32]; \ - mul x15, x3, x9; \ - mul x21, x4, x10; \ - mul x22, x5, x11; \ - umulh x23, x3, x9; \ - umulh x0, x4, x10; \ - umulh x1, x5, x11; \ - adds x23, x23, x21; \ - adcs x0, x0, x22; \ - adc x1, x1, xzr; \ - adds x16, x23, x15; \ - adcs x17, x0, x23; \ - adcs x19, x1, x0; \ - adc x20, x1, xzr; \ - adds x17, x17, x15; \ - adcs x19, x19, x23; \ - adcs x20, x20, x0; \ - adc x1, x1, xzr; \ - subs x0, x3, x4; \ - cneg x0, x0, lo; \ - csetm x23, lo; \ - subs x22, x10, x9; \ - cneg x22, x22, lo; \ - mul x21, x0, x22; \ - umulh x22, x0, x22; \ - cinv x23, x23, lo; \ - eor x21, x21, x23; \ - eor x22, x22, x23; \ - cmn x23, #1; \ - adcs x16, x16, x21; \ - adcs x17, x17, x22; \ - adcs x19, x19, x23; \ - adcs x20, x20, x23; \ - adc x1, x1, x23; \ - subs x0, x3, x5; \ - cneg x0, x0, lo; \ - csetm x23, lo; \ - subs x22, x11, x9; \ - cneg x22, x22, lo; \ - mul x21, x0, x22; \ - umulh x22, x0, x22; \ - cinv x23, x23, lo; \ - eor x21, x21, x23; \ - eor x22, x22, x23; \ - cmn x23, #1; \ - adcs x17, x17, x21; \ - adcs x19, x19, x22; \ - adcs x20, x20, x23; \ - adc x1, x1, x23; \ - subs x0, x4, x5; \ - cneg x0, x0, lo; \ - csetm x23, lo; \ - subs x22, x11, x10; \ - cneg x22, x22, lo; \ - mul x21, x0, x22; \ - umulh x22, x0, x22; \ - cinv x23, x23, lo; \ - eor x21, x21, x23; \ - eor x22, x22, x23; \ - cmn x23, #1; \ - adcs x19, x19, x21; \ - adcs x20, x20, x22; \ - adc x1, x1, x23; \ - lsl x23, x15, #32; \ - add x15, x23, x15; \ - lsr x23, x15, #32; \ - subs x23, x23, x15; \ - sbc x22, x15, xzr; \ - extr x23, x22, x23, #32; \ - lsr x22, x22, #32; \ - adds x22, x22, x15; \ - adc x21, xzr, xzr; \ - subs x16, x16, x23; \ - sbcs x17, x17, x22; \ - sbcs x19, x19, x21; \ - sbcs x20, x20, xzr; \ - sbcs x1, x1, xzr; \ - sbc x15, x15, xzr; \ - lsl x23, x16, #32; \ - add x16, x23, x16; \ - lsr x23, x16, #32; \ - subs x23, x23, x16; \ - sbc x22, x16, xzr; \ - extr x23, x22, x23, #32; \ - lsr x22, x22, #32; \ - adds x22, x22, x16; \ - adc x21, xzr, xzr; \ 
- subs x17, x17, x23; \ - sbcs x19, x19, x22; \ - sbcs x20, x20, x21; \ - sbcs x1, x1, xzr; \ - sbcs x15, x15, xzr; \ - sbc x16, x16, xzr; \ - lsl x23, x17, #32; \ - add x17, x23, x17; \ - lsr x23, x17, #32; \ - subs x23, x23, x17; \ - sbc x22, x17, xzr; \ - extr x23, x22, x23, #32; \ - lsr x22, x22, #32; \ - adds x22, x22, x17; \ - adc x21, xzr, xzr; \ - subs x19, x19, x23; \ - sbcs x20, x20, x22; \ - sbcs x1, x1, x21; \ - sbcs x15, x15, xzr; \ - sbcs x16, x16, xzr; \ - sbc x17, x17, xzr; \ - stp x19, x20, [P0]; \ - stp x1, x15, [P0+16]; \ - stp x16, x17, [P0+32]; \ - mul x15, x6, x12; \ - mul x21, x7, x13; \ - mul x22, x8, x14; \ - umulh x23, x6, x12; \ - umulh x0, x7, x13; \ - umulh x1, x8, x14; \ - adds x23, x23, x21; \ - adcs x0, x0, x22; \ - adc x1, x1, xzr; \ - adds x16, x23, x15; \ - adcs x17, x0, x23; \ - adcs x19, x1, x0; \ - adc x20, x1, xzr; \ - adds x17, x17, x15; \ - adcs x19, x19, x23; \ - adcs x20, x20, x0; \ - adc x1, x1, xzr; \ - subs x0, x6, x7; \ - cneg x0, x0, lo; \ - csetm x23, lo; \ - subs x22, x13, x12; \ - cneg x22, x22, lo; \ - mul x21, x0, x22; \ - umulh x22, x0, x22; \ - cinv x23, x23, lo; \ - eor x21, x21, x23; \ - eor x22, x22, x23; \ - cmn x23, #1; \ - adcs x16, x16, x21; \ - adcs x17, x17, x22; \ - adcs x19, x19, x23; \ - adcs x20, x20, x23; \ - adc x1, x1, x23; \ - subs x0, x6, x8; \ - cneg x0, x0, lo; \ - csetm x23, lo; \ - subs x22, x14, x12; \ - cneg x22, x22, lo; \ - mul x21, x0, x22; \ - umulh x22, x0, x22; \ - cinv x23, x23, lo; \ - eor x21, x21, x23; \ - eor x22, x22, x23; \ - cmn x23, #1; \ - adcs x17, x17, x21; \ - adcs x19, x19, x22; \ - adcs x20, x20, x23; \ - adc x1, x1, x23; \ - subs x0, x7, x8; \ - cneg x0, x0, lo; \ - csetm x23, lo; \ - subs x22, x14, x13; \ - cneg x22, x22, lo; \ - mul x21, x0, x22; \ - umulh x22, x0, x22; \ - cinv x23, x23, lo; \ - eor x21, x21, x23; \ - eor x22, x22, x23; \ - cmn x23, #1; \ - adcs x19, x19, x21; \ - adcs x20, x20, x22; \ - adc x1, x1, x23; \ - subs x6, x6, x3; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x5; \ - ngc x3, xzr; \ - cmn x3, #1; \ - eor x6, x6, x3; \ - adcs x6, x6, xzr; \ - eor x7, x7, x3; \ - adcs x7, x7, xzr; \ - eor x8, x8, x3; \ - adc x8, x8, xzr; \ - subs x9, x9, x12; \ - sbcs x10, x10, x13; \ - sbcs x11, x11, x14; \ - ngc x14, xzr; \ - cmn x14, #1; \ - eor x9, x9, x14; \ - adcs x9, x9, xzr; \ - eor x10, x10, x14; \ - adcs x10, x10, xzr; \ - eor x11, x11, x14; \ - adc x11, x11, xzr; \ - eor x14, x3, x14; \ - ldp x21, x22, [P0]; \ - adds x15, x15, x21; \ - adcs x16, x16, x22; \ - ldp x21, x22, [P0+16]; \ - adcs x17, x17, x21; \ - adcs x19, x19, x22; \ - ldp x21, x22, [P0+32]; \ - adcs x20, x20, x21; \ - adcs x1, x1, x22; \ - adc x2, xzr, xzr; \ - stp x15, x16, [P0]; \ - stp x17, x19, [P0+16]; \ - stp x20, x1, [P0+32]; \ - mul x15, x6, x9; \ - mul x21, x7, x10; \ - mul x22, x8, x11; \ - umulh x23, x6, x9; \ - umulh x0, x7, x10; \ - umulh x1, x8, x11; \ - adds x23, x23, x21; \ - adcs x0, x0, x22; \ - adc x1, x1, xzr; \ - adds x16, x23, x15; \ - adcs x17, x0, x23; \ - adcs x19, x1, x0; \ - adc x20, x1, xzr; \ - adds x17, x17, x15; \ - adcs x19, x19, x23; \ - adcs x20, x20, x0; \ - adc x1, x1, xzr; \ - subs x0, x6, x7; \ - cneg x0, x0, lo; \ - csetm x23, lo; \ - subs x22, x10, x9; \ - cneg x22, x22, lo; \ - mul x21, x0, x22; \ - umulh x22, x0, x22; \ - cinv x23, x23, lo; \ - eor x21, x21, x23; \ - eor x22, x22, x23; \ - cmn x23, #1; \ - adcs x16, x16, x21; \ - adcs x17, x17, x22; \ - adcs x19, x19, x23; \ - adcs x20, x20, x23; \ - adc x1, x1, x23; \ - subs x0, x6, x8; \ - cneg x0, x0, lo; \ - csetm 
x23, lo; \ - subs x22, x11, x9; \ - cneg x22, x22, lo; \ - mul x21, x0, x22; \ - umulh x22, x0, x22; \ - cinv x23, x23, lo; \ - eor x21, x21, x23; \ - eor x22, x22, x23; \ - cmn x23, #1; \ - adcs x17, x17, x21; \ - adcs x19, x19, x22; \ - adcs x20, x20, x23; \ - adc x1, x1, x23; \ - subs x0, x7, x8; \ - cneg x0, x0, lo; \ - csetm x23, lo; \ - subs x22, x11, x10; \ - cneg x22, x22, lo; \ - mul x21, x0, x22; \ - umulh x22, x0, x22; \ - cinv x23, x23, lo; \ - eor x21, x21, x23; \ - eor x22, x22, x23; \ - cmn x23, #1; \ - adcs x19, x19, x21; \ - adcs x20, x20, x22; \ - adc x1, x1, x23; \ - ldp x3, x4, [P0]; \ - ldp x5, x6, [P0+16]; \ - ldp x7, x8, [P0+32]; \ - cmn x14, #1; \ - eor x15, x15, x14; \ - adcs x15, x15, x3; \ - eor x16, x16, x14; \ - adcs x16, x16, x4; \ - eor x17, x17, x14; \ - adcs x17, x17, x5; \ - eor x19, x19, x14; \ - adcs x19, x19, x6; \ - eor x20, x20, x14; \ - adcs x20, x20, x7; \ - eor x1, x1, x14; \ - adcs x1, x1, x8; \ - adcs x9, x14, x2; \ - adcs x10, x14, xzr; \ - adcs x11, x14, xzr; \ - adc x12, x14, xzr; \ - adds x19, x19, x3; \ - adcs x20, x20, x4; \ - adcs x1, x1, x5; \ - adcs x9, x9, x6; \ - adcs x10, x10, x7; \ - adcs x11, x11, x8; \ - adc x12, x12, x2; \ - lsl x23, x15, #32; \ - add x15, x23, x15; \ - lsr x23, x15, #32; \ - subs x23, x23, x15; \ - sbc x22, x15, xzr; \ - extr x23, x22, x23, #32; \ - lsr x22, x22, #32; \ - adds x22, x22, x15; \ - adc x21, xzr, xzr; \ - subs x16, x16, x23; \ - sbcs x17, x17, x22; \ - sbcs x19, x19, x21; \ - sbcs x20, x20, xzr; \ - sbcs x1, x1, xzr; \ - sbc x15, x15, xzr; \ - lsl x23, x16, #32; \ - add x16, x23, x16; \ - lsr x23, x16, #32; \ - subs x23, x23, x16; \ - sbc x22, x16, xzr; \ - extr x23, x22, x23, #32; \ - lsr x22, x22, #32; \ - adds x22, x22, x16; \ - adc x21, xzr, xzr; \ - subs x17, x17, x23; \ - sbcs x19, x19, x22; \ - sbcs x20, x20, x21; \ - sbcs x1, x1, xzr; \ - sbcs x15, x15, xzr; \ - sbc x16, x16, xzr; \ - lsl x23, x17, #32; \ - add x17, x23, x17; \ - lsr x23, x17, #32; \ - subs x23, x23, x17; \ - sbc x22, x17, xzr; \ - extr x23, x22, x23, #32; \ - lsr x22, x22, #32; \ - adds x22, x22, x17; \ - adc x21, xzr, xzr; \ - subs x19, x19, x23; \ - sbcs x20, x20, x22; \ - sbcs x1, x1, x21; \ - sbcs x15, x15, xzr; \ - sbcs x16, x16, xzr; \ - sbc x17, x17, xzr; \ - adds x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, x17; \ - adc x12, x12, xzr; \ - add x22, x12, #1; \ - lsl x21, x22, #32; \ - subs x0, x22, x21; \ - sbc x21, x21, xzr; \ - adds x19, x19, x0; \ - adcs x20, x20, x21; \ - adcs x1, x1, x22; \ - adcs x9, x9, xzr; \ - adcs x10, x10, xzr; \ - adcs x11, x11, xzr; \ - csetm x22, lo; \ - mov x23, #4294967295; \ - and x23, x23, x22; \ - adds x19, x19, x23; \ - eor x23, x23, x22; \ - adcs x20, x20, x23; \ - mov x23, #-2; \ - and x23, x23, x22; \ - adcs x1, x1, x23; \ - adcs x9, x9, x22; \ - adcs x10, x10, x22; \ - adc x11, x11, x22; \ - stp x19, x20, [P0]; \ - stp x1, x9, [P0+16]; \ - stp x10, x11, [P0+32] - -// Corresponds exactly to bignum_montsqr_p384 - -#define montsqr_p384(P0,P1) \ - ldp x2, x3, [P1]; \ - ldp x4, x5, [P1+16]; \ - ldp x6, x7, [P1+32]; \ - mul x14, x2, x3; \ - mul x15, x2, x4; \ - mul x16, x3, x4; \ - mul x8, x2, x2; \ - mul x10, x3, x3; \ - mul x12, x4, x4; \ - umulh x17, x2, x3; \ - adds x15, x15, x17; \ - umulh x17, x2, x4; \ - adcs x16, x16, x17; \ - umulh x17, x3, x4; \ - adcs x17, x17, xzr; \ - umulh x9, x2, x2; \ - umulh x11, x3, x3; \ - umulh x13, x4, x4; \ - adds x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adc x13, x13, xzr; \ - adds x9, 
x9, x14; \ - adcs x10, x10, x15; \ - adcs x11, x11, x16; \ - adcs x12, x12, x17; \ - adc x13, x13, xzr; \ - lsl x16, x8, #32; \ - add x8, x16, x8; \ - lsr x16, x8, #32; \ - subs x16, x16, x8; \ - sbc x15, x8, xzr; \ - extr x16, x15, x16, #32; \ - lsr x15, x15, #32; \ - adds x15, x15, x8; \ - adc x14, xzr, xzr; \ - subs x9, x9, x16; \ - sbcs x10, x10, x15; \ - sbcs x11, x11, x14; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbc x8, x8, xzr; \ - lsl x16, x9, #32; \ - add x9, x16, x9; \ - lsr x16, x9, #32; \ - subs x16, x16, x9; \ - sbc x15, x9, xzr; \ - extr x16, x15, x16, #32; \ - lsr x15, x15, #32; \ - adds x15, x15, x9; \ - adc x14, xzr, xzr; \ - subs x10, x10, x16; \ - sbcs x11, x11, x15; \ - sbcs x12, x12, x14; \ - sbcs x13, x13, xzr; \ - sbcs x8, x8, xzr; \ - sbc x9, x9, xzr; \ - lsl x16, x10, #32; \ - add x10, x16, x10; \ - lsr x16, x10, #32; \ - subs x16, x16, x10; \ - sbc x15, x10, xzr; \ - extr x16, x15, x16, #32; \ - lsr x15, x15, #32; \ - adds x15, x15, x10; \ - adc x14, xzr, xzr; \ - subs x11, x11, x16; \ - sbcs x12, x12, x15; \ - sbcs x13, x13, x14; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbc x10, x10, xzr; \ - stp x11, x12, [P0]; \ - stp x13, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - mul x8, x2, x5; \ - mul x14, x3, x6; \ - mul x15, x4, x7; \ - umulh x16, x2, x5; \ - umulh x17, x3, x6; \ - umulh x1, x4, x7; \ - adds x16, x16, x14; \ - adcs x17, x17, x15; \ - adc x1, x1, xzr; \ - adds x9, x16, x8; \ - adcs x10, x17, x16; \ - adcs x11, x1, x17; \ - adc x12, x1, xzr; \ - adds x10, x10, x8; \ - adcs x11, x11, x16; \ - adcs x12, x12, x17; \ - adc x13, x1, xzr; \ - subs x17, x2, x3; \ - cneg x17, x17, lo; \ - csetm x14, lo; \ - subs x15, x6, x5; \ - cneg x15, x15, lo; \ - mul x16, x17, x15; \ - umulh x15, x17, x15; \ - cinv x14, x14, lo; \ - eor x16, x16, x14; \ - eor x15, x15, x14; \ - cmn x14, #1; \ - adcs x9, x9, x16; \ - adcs x10, x10, x15; \ - adcs x11, x11, x14; \ - adcs x12, x12, x14; \ - adc x13, x13, x14; \ - subs x17, x2, x4; \ - cneg x17, x17, lo; \ - csetm x14, lo; \ - subs x15, x7, x5; \ - cneg x15, x15, lo; \ - mul x16, x17, x15; \ - umulh x15, x17, x15; \ - cinv x14, x14, lo; \ - eor x16, x16, x14; \ - eor x15, x15, x14; \ - cmn x14, #1; \ - adcs x10, x10, x16; \ - adcs x11, x11, x15; \ - adcs x12, x12, x14; \ - adc x13, x13, x14; \ - subs x17, x3, x4; \ - cneg x17, x17, lo; \ - csetm x14, lo; \ - subs x15, x7, x6; \ - cneg x15, x15, lo; \ - mul x16, x17, x15; \ - umulh x15, x17, x15; \ - cinv x14, x14, lo; \ - eor x16, x16, x14; \ - eor x15, x15, x14; \ - cmn x14, #1; \ - adcs x11, x11, x16; \ - adcs x12, x12, x15; \ - adc x13, x13, x14; \ - adds x8, x8, x8; \ - adcs x9, x9, x9; \ - adcs x10, x10, x10; \ - adcs x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adc x17, xzr, xzr; \ - ldp x2, x3, [P0]; \ - adds x8, x8, x2; \ - adcs x9, x9, x3; \ - ldp x2, x3, [P0+16]; \ - adcs x10, x10, x2; \ - adcs x11, x11, x3; \ - ldp x2, x3, [P0+32]; \ - adcs x12, x12, x2; \ - adcs x13, x13, x3; \ - adc x17, x17, xzr; \ - lsl x4, x8, #32; \ - add x8, x4, x8; \ - lsr x4, x8, #32; \ - subs x4, x4, x8; \ - sbc x3, x8, xzr; \ - extr x4, x3, x4, #32; \ - lsr x3, x3, #32; \ - adds x3, x3, x8; \ - adc x2, xzr, xzr; \ - subs x9, x9, x4; \ - sbcs x10, x10, x3; \ - sbcs x11, x11, x2; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbc x8, x8, xzr; \ - lsl x4, x9, #32; \ - add x9, x4, x9; \ - lsr x4, x9, #32; \ - subs x4, x4, x9; \ - sbc x3, x9, xzr; \ - extr x4, x3, x4, #32; \ - lsr x3, x3, #32; \ - adds x3, x3, x9; \ - adc x2, xzr, xzr; \ - subs x10, 
x10, x4; \ - sbcs x11, x11, x3; \ - sbcs x12, x12, x2; \ - sbcs x13, x13, xzr; \ - sbcs x8, x8, xzr; \ - sbc x9, x9, xzr; \ - lsl x4, x10, #32; \ - add x10, x4, x10; \ - lsr x4, x10, #32; \ - subs x4, x4, x10; \ - sbc x3, x10, xzr; \ - extr x4, x3, x4, #32; \ - lsr x3, x3, #32; \ - adds x3, x3, x10; \ - adc x2, xzr, xzr; \ - subs x11, x11, x4; \ - sbcs x12, x12, x3; \ - sbcs x13, x13, x2; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbc x10, x10, xzr; \ - adds x17, x17, x8; \ - adcs x8, x9, xzr; \ - adcs x9, x10, xzr; \ - adcs x10, xzr, xzr; \ - mul x1, x5, x5; \ - adds x11, x11, x1; \ - mul x14, x6, x6; \ - mul x15, x7, x7; \ - umulh x1, x5, x5; \ - adcs x12, x12, x1; \ - umulh x1, x6, x6; \ - adcs x13, x13, x14; \ - adcs x17, x17, x1; \ - umulh x1, x7, x7; \ - adcs x8, x8, x15; \ - adcs x9, x9, x1; \ - adc x10, x10, xzr; \ - mul x1, x5, x6; \ - mul x14, x5, x7; \ - mul x15, x6, x7; \ - umulh x16, x5, x6; \ - adds x14, x14, x16; \ - umulh x16, x5, x7; \ - adcs x15, x15, x16; \ - umulh x16, x6, x7; \ - adc x16, x16, xzr; \ - adds x1, x1, x1; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adc x5, xzr, xzr; \ - adds x12, x12, x1; \ - adcs x13, x13, x14; \ - adcs x17, x17, x15; \ - adcs x8, x8, x16; \ - adcs x9, x9, x5; \ - adc x10, x10, xzr; \ - mov x1, #-4294967295; \ - mov x14, #4294967295; \ - mov x15, #1; \ - cmn x11, x1; \ - adcs xzr, x12, x14; \ - adcs xzr, x13, x15; \ - adcs xzr, x17, xzr; \ - adcs xzr, x8, xzr; \ - adcs xzr, x9, xzr; \ - adc x10, x10, xzr; \ - neg x10, x10; \ - and x1, x1, x10; \ - adds x11, x11, x1; \ - and x14, x14, x10; \ - adcs x12, x12, x14; \ - and x15, x15, x10; \ - adcs x13, x13, x15; \ - adcs x17, x17, xzr; \ - adcs x8, x8, xzr; \ - adc x9, x9, xzr; \ - stp x11, x12, [P0]; \ - stp x13, x17, [P0+16]; \ - stp x8, x9, [P0+32] - -// Corresponds exactly to bignum_sub_p384 - -#define sub_p384(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - csetm x3, lo; \ - mov x4, #4294967295; \ - and x4, x4, x3; \ - adds x5, x5, x4; \ - eor x4, x4, x3; \ - adcs x6, x6, x4; \ - mov x4, #-2; \ - and x4, x4, x3; \ - adcs x7, x7, x4; \ - adcs x8, x8, x3; \ - adcs x9, x9, x3; \ - adc x10, x10, x3; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32] - -S2N_BN_SYMBOL(p384_montjmixadd): - -// Save regs and make room on stack for temporary variables - - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! - stp x23, x24, [sp, #-16]! - stp x25, x26, [sp, #-16]! 
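The sub_p384 macro above (an inline copy of bignum_sub_p384) subtracts with borrow and then conditionally adds p_384 back under a mask, so for reduced inputs the result stays in [0, p_384) without a data-dependent branch. A standalone functional model in C, for illustration only (the function name and test values here are mine, not from the tree):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Model of sub_p384 / bignum_sub_p384: r = (a - b) mod p_384. */
    static void sub_p384_model(uint64_t r[6], const uint64_t a[6],
                               const uint64_t b[6]) {
      static const uint64_t p[6] = {
          0x00000000ffffffffULL, 0xffffffff00000000ULL, 0xfffffffffffffffeULL,
          0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL};
      uint64_t borrow = 0;
      for (int i = 0; i < 6; i++) {      /* subs/sbcs chain                    */
        uint64_t d = a[i] - b[i];
        uint64_t b1 = a[i] < b[i];
        r[i] = d - borrow;
        borrow = b1 | (d < borrow);
      }
      uint64_t mask = 0 - borrow;        /* csetm: all-ones iff we went negative */
      uint64_t carry = 0;
      for (int i = 0; i < 6; i++) {      /* conditionally add p_384 back       */
        uint64_t t = r[i] + (p[i] & mask);
        uint64_t c1 = t < r[i];
        r[i] = t + carry;
        carry = c1 | (r[i] < t);
      }
    }

    int main(void) {
      uint64_t one[6] = {1, 0, 0, 0, 0, 0}, two[6] = {2, 0, 0, 0, 0, 0}, r[6];
      sub_p384_model(r, one, two);       /* 1 - 2 == p_384 - 1 (mod p_384)     */
      assert(r[0] == 0x00000000fffffffeULL && r[5] == 0xffffffffffffffffULL);
      printf("sub_p384 model OK\n");
      return 0;
    }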
- sub sp, sp, NSPACE - -// Move the input arguments to stable places - - mov input_z, x0 - mov input_x, x1 - mov input_y, x2 - -// Main code, just a sequence of basic field operations -// 8 * multiply + 3 * square + 7 * subtract - - montsqr_p384(zp2,z_1) - montmul_p384(y2a,z_1,y_2) - - montmul_p384(x2a,zp2,x_2) - montmul_p384(y2a,zp2,y2a) - - sub_p384(xd,x2a,x_1) - sub_p384(yd,y2a,y_1) - - montsqr_p384(zz,xd) - montsqr_p384(ww,yd) - - montmul_p384(zzx1,zz,x_1) - montmul_p384(zzx2,zz,x2a) - - sub_p384(resx,ww,zzx1) - sub_p384(t1,zzx2,zzx1) - - montmul_p384(resz,xd,z_1) - - sub_p384(resx,resx,zzx2) - - sub_p384(t2,zzx1,resx) - - montmul_p384(t1,t1,y_1) - montmul_p384(t2,yd,t2) - - sub_p384(resy,t2,t1) - -// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) - - ldp x0, x1, [z_1] - ldp x2, x3, [z_1+16] - ldp x4, x5, [z_1+32] - orr x6, x0, x1 - orr x7, x2, x3 - orr x8, x4, x5 - orr x6, x6, x7 - orr x6, x6, x8 - cmp x6, xzr - -// Multiplex: if p1 <> 0 just copy the computed result from the staging area. -// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in -// Montgomery form so not the simple constant 1 but rather 2^384 - p_384), -// hence giving 0 + p2 = p2 for the final result. - - ldp x0, x1, [resx] - ldp x19, x20, [x_2] - csel x0, x0, x19, ne - csel x1, x1, x20, ne - ldp x2, x3, [resx+16] - ldp x19, x20, [x_2+16] - csel x2, x2, x19, ne - csel x3, x3, x20, ne - ldp x4, x5, [resx+32] - ldp x19, x20, [x_2+32] - csel x4, x4, x19, ne - csel x5, x5, x20, ne - - ldp x6, x7, [resy] - ldp x19, x20, [y_2] - csel x6, x6, x19, ne - csel x7, x7, x20, ne - ldp x8, x9, [resy+16] - ldp x19, x20, [y_2+16] - csel x8, x8, x19, ne - csel x9, x9, x20, ne - ldp x10, x11, [resy+32] - ldp x19, x20, [y_2+32] - csel x10, x10, x19, ne - csel x11, x11, x20, ne - - ldp x12, x13, [resz] - mov x19, #0xffffffff00000001 - mov x20, #0x00000000ffffffff - csel x12, x12, x19, ne - csel x13, x13, x20, ne - ldp x14, x15, [resz+16] - mov x19, #1 - csel x14, x14, x19, ne - csel x15, x15, xzr, ne - ldp x16, x17, [resz+32] - csel x16, x16, xzr, ne - csel x17, x17, xzr, ne - - stp x0, x1, [x_3] - stp x2, x3, [x_3+16] - stp x4, x5, [x_3+32] - stp x6, x7, [y_3] - stp x8, x9, [y_3+16] - stp x10, x11, [y_3+32] - stp x12, x13, [z_3] - stp x14, x15, [z_3+16] - stp x16, x17, [z_3+32] - -// Restore stack and registers - - add sp, sp, NSPACE - - ldp x25, x26, [sp], 16 - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits -#endif diff --git a/third_party/s2n-bignum/arm/p384/p384_montjmixadd_alt.S b/third_party/s2n-bignum/arm/p384/p384_montjmixadd_alt.S deleted file mode 100644 index f36301a11ed..00000000000 --- a/third_party/s2n-bignum/arm/p384/p384_montjmixadd_alt.S +++ /dev/null @@ -1,941 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates -// -// extern void p384_montjmixadd_alt -// (uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 12]); -// -// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with -// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. -// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). 
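At the level of field operations, the p384_montjmixadd sequence just listed reads as the following C sketch. The fe type and fe_* names are hypothetical stand-ins for the montsqr_p384 / montmul_p384 / sub_p384 macros, and the z_1 = 0 multiplexing at the end of the routine is omitted; this is an illustration, not code from the tree:

    #include <stdint.h>

    typedef uint64_t fe[6];  /* P-384 field element, Montgomery-encoded */

    void fe_montsqr(fe r, const fe a);              /* montsqr_p384 */
    void fe_montmul(fe r, const fe a, const fe b);  /* montmul_p384 */
    void fe_sub(fe r, const fe a, const fe b);      /* sub_p384     */

    /* Mixed addition core: (x1,y1,z1) + (x2,y2,1) -> (x3,y3,z3). */
    void p384_jmixadd_sketch(fe x3, fe y3, fe z3,
                             const fe x1, const fe y1, const fe z1,
                             const fe x2, const fe y2) {
      fe zp2, y2a, x2a, xd, yd, zz, ww, zzx1, zzx2, t1, t2;
      fe_montsqr(zp2, z1);            /* zp2  = z1^2              */
      fe_montmul(y2a, z1, y2);        /* y2a  = z1 * y2           */
      fe_montmul(x2a, zp2, x2);       /* x2a  = z1^2 * x2         */
      fe_montmul(y2a, zp2, y2a);      /* y2a  = z1^3 * y2         */
      fe_sub(xd, x2a, x1);            /* xd   = x2a - x1          */
      fe_sub(yd, y2a, y1);            /* yd   = y2a - y1          */
      fe_montsqr(zz, xd);             /* zz   = xd^2              */
      fe_montsqr(ww, yd);             /* ww   = yd^2              */
      fe_montmul(zzx1, zz, x1);       /* zzx1 = zz * x1           */
      fe_montmul(zzx2, zz, x2a);      /* zzx2 = zz * x2a          */
      fe_sub(x3, ww, zzx1);           /* x3   = yd^2 - zz*x1      */
      fe_sub(t1, zzx2, zzx1);         /* t1   = zz*(x2a - x1)     */
      fe_montmul(z3, xd, z1);         /* z3   = xd * z1           */
      fe_sub(x3, x3, zzx2);           /* x3  -= zz*x2a            */
      fe_sub(t2, zzx1, x3);           /* t2   = zz*x1 - x3        */
      fe_montmul(t1, t1, y1);         /* t1   = y1*zz*(x2a - x1)  */
      fe_montmul(t2, yd, t2);         /* t2   = yd*(zz*x1 - x3)   */
      fe_sub(y3, t2, t1);             /* y3   = t2 - t1           */
    }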
-// The "mixed" part means that p2 only has x and y coordinates, with the -// implicit z coordinate assumed to be the identity. -// -// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjmixadd_alt) - S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjmixadd_alt) - .text - .balign 4 - -// Size of individual field elements - -#define NUMSIZE 48 - -// Stable homes for input arguments during main code sequence - -#define input_z x24 -#define input_x x25 -#define input_y x26 - -// Pointer-offset pairs for inputs and outputs - -#define x_1 input_x, #0 -#define y_1 input_x, #NUMSIZE -#define z_1 input_x, #(2*NUMSIZE) - -#define x_2 input_y, #0 -#define y_2 input_y, #NUMSIZE - -#define x_3 input_z, #0 -#define y_3 input_z, #NUMSIZE -#define z_3 input_z, #(2*NUMSIZE) - -// Pointer-offset pairs for temporaries, with some aliasing -// NSPACE is the total stack needed for these temporaries - -#define zp2 sp, #(NUMSIZE*0) -#define ww sp, #(NUMSIZE*0) -#define resx sp, #(NUMSIZE*0) - -#define yd sp, #(NUMSIZE*1) -#define y2a sp, #(NUMSIZE*1) - -#define x2a sp, #(NUMSIZE*2) -#define zzx2 sp, #(NUMSIZE*2) - -#define zz sp, #(NUMSIZE*3) -#define t1 sp, #(NUMSIZE*3) - -#define t2 sp, #(NUMSIZE*4) -#define zzx1 sp, #(NUMSIZE*4) -#define resy sp, #(NUMSIZE*4) - -#define xd sp, #(NUMSIZE*5) -#define resz sp, #(NUMSIZE*5) - -#define NSPACE (NUMSIZE*6) - -// Corresponds exactly to bignum_montmul_p384_alt - -#define montmul_p384(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x12, x3, x5; \ - umulh x13, x3, x5; \ - mul x11, x3, x6; \ - umulh x14, x3, x6; \ - adds x13, x13, x11; \ - ldp x7, x8, [P2+16]; \ - mul x11, x3, x7; \ - umulh x15, x3, x7; \ - adcs x14, x14, x11; \ - mul x11, x3, x8; \ - umulh x16, x3, x8; \ - adcs x15, x15, x11; \ - ldp x9, x10, [P2+32]; \ - mul x11, x3, x9; \ - umulh x17, x3, x9; \ - adcs x16, x16, x11; \ - mul x11, x3, x10; \ - umulh x19, x3, x10; \ - adcs x17, x17, x11; \ - adc x19, x19, xzr; \ - mul x11, x4, x5; \ - adds x13, x13, x11; \ - mul x11, x4, x6; \ - adcs x14, x14, x11; \ - mul x11, x4, x7; \ - adcs x15, x15, x11; \ - mul x11, x4, x8; \ - adcs x16, x16, x11; \ - mul x11, x4, x9; \ - adcs x17, x17, x11; \ - mul x11, x4, x10; \ - adcs x19, x19, x11; \ - cset x20, cs; \ - umulh x11, x4, x5; \ - adds x14, x14, x11; \ - umulh x11, x4, x6; \ - adcs x15, x15, x11; \ - umulh x11, x4, x7; \ - adcs x16, x16, x11; \ - umulh x11, x4, x8; \ - adcs x17, x17, x11; \ - umulh x11, x4, x9; \ - adcs x19, x19, x11; \ - umulh x11, x4, x10; \ - adc x20, x20, x11; \ - ldp x3, x4, [P1+16]; \ - mul x11, x3, x5; \ - adds x14, x14, x11; \ - mul x11, x3, x6; \ - adcs x15, x15, x11; \ - mul x11, x3, x7; \ - adcs x16, x16, x11; \ - mul x11, x3, x8; \ - adcs x17, x17, x11; \ - mul x11, x3, x9; \ - adcs x19, x19, x11; \ - mul x11, x3, x10; \ - adcs x20, x20, x11; \ - cset x21, cs; \ - umulh x11, x3, x5; \ - adds x15, x15, x11; \ - umulh x11, x3, x6; \ - adcs x16, x16, x11; \ - umulh x11, x3, x7; \ - adcs x17, x17, x11; \ - umulh x11, x3, x8; \ - adcs x19, x19, x11; \ - umulh x11, x3, x9; \ - adcs x20, x20, x11; \ - umulh x11, x3, x10; \ - adc x21, x21, x11; \ - mul x11, x4, x5; \ - adds x15, x15, x11; \ - mul x11, x4, x6; \ - adcs x16, x16, x11; \ - mul x11, x4, x7; \ - adcs x17, x17, x11; \ - mul x11, x4, x8; \ - adcs x19, x19, x11; \ - mul x11, x4, x9; \ - adcs x20, x20, x11; \ - mul x11, x4, x10; \ - adcs x21, x21, x11; \ - cset x22, cs; \ - umulh 
x11, x4, x5; \ - adds x16, x16, x11; \ - umulh x11, x4, x6; \ - adcs x17, x17, x11; \ - umulh x11, x4, x7; \ - adcs x19, x19, x11; \ - umulh x11, x4, x8; \ - adcs x20, x20, x11; \ - umulh x11, x4, x9; \ - adcs x21, x21, x11; \ - umulh x11, x4, x10; \ - adc x22, x22, x11; \ - ldp x3, x4, [P1+32]; \ - mul x11, x3, x5; \ - adds x16, x16, x11; \ - mul x11, x3, x6; \ - adcs x17, x17, x11; \ - mul x11, x3, x7; \ - adcs x19, x19, x11; \ - mul x11, x3, x8; \ - adcs x20, x20, x11; \ - mul x11, x3, x9; \ - adcs x21, x21, x11; \ - mul x11, x3, x10; \ - adcs x22, x22, x11; \ - cset x2, cs; \ - umulh x11, x3, x5; \ - adds x17, x17, x11; \ - umulh x11, x3, x6; \ - adcs x19, x19, x11; \ - umulh x11, x3, x7; \ - adcs x20, x20, x11; \ - umulh x11, x3, x8; \ - adcs x21, x21, x11; \ - umulh x11, x3, x9; \ - adcs x22, x22, x11; \ - umulh x11, x3, x10; \ - adc x2, x2, x11; \ - mul x11, x4, x5; \ - adds x17, x17, x11; \ - mul x11, x4, x6; \ - adcs x19, x19, x11; \ - mul x11, x4, x7; \ - adcs x20, x20, x11; \ - mul x11, x4, x8; \ - adcs x21, x21, x11; \ - mul x11, x4, x9; \ - adcs x22, x22, x11; \ - mul x11, x4, x10; \ - adcs x2, x2, x11; \ - cset x1, cs; \ - umulh x11, x4, x5; \ - adds x19, x19, x11; \ - umulh x11, x4, x6; \ - adcs x20, x20, x11; \ - umulh x11, x4, x7; \ - adcs x21, x21, x11; \ - umulh x11, x4, x8; \ - adcs x22, x22, x11; \ - umulh x11, x4, x9; \ - adcs x2, x2, x11; \ - umulh x11, x4, x10; \ - adc x1, x1, x11; \ - lsl x7, x12, #32; \ - add x12, x7, x12; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x12; \ - mov x6, #0xffffffff; \ - mul x5, x6, x12; \ - umulh x6, x6, x12; \ - adds x7, x7, x5; \ - adcs x6, x6, x12; \ - adc x5, xzr, xzr; \ - subs x13, x13, x7; \ - sbcs x14, x14, x6; \ - sbcs x15, x15, x5; \ - sbcs x16, x16, xzr; \ - sbcs x17, x17, xzr; \ - sbc x12, x12, xzr; \ - lsl x7, x13, #32; \ - add x13, x7, x13; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x13; \ - mov x6, #0xffffffff; \ - mul x5, x6, x13; \ - umulh x6, x6, x13; \ - adds x7, x7, x5; \ - adcs x6, x6, x13; \ - adc x5, xzr, xzr; \ - subs x14, x14, x7; \ - sbcs x15, x15, x6; \ - sbcs x16, x16, x5; \ - sbcs x17, x17, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - lsl x7, x14, #32; \ - add x14, x7, x14; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x14; \ - mov x6, #0xffffffff; \ - mul x5, x6, x14; \ - umulh x6, x6, x14; \ - adds x7, x7, x5; \ - adcs x6, x6, x14; \ - adc x5, xzr, xzr; \ - subs x15, x15, x7; \ - sbcs x16, x16, x6; \ - sbcs x17, x17, x5; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbc x14, x14, xzr; \ - lsl x7, x15, #32; \ - add x15, x7, x15; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x15; \ - mov x6, #0xffffffff; \ - mul x5, x6, x15; \ - umulh x6, x6, x15; \ - adds x7, x7, x5; \ - adcs x6, x6, x15; \ - adc x5, xzr, xzr; \ - subs x16, x16, x7; \ - sbcs x17, x17, x6; \ - sbcs x12, x12, x5; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbc x15, x15, xzr; \ - lsl x7, x16, #32; \ - add x16, x7, x16; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x16; \ - mov x6, #0xffffffff; \ - mul x5, x6, x16; \ - umulh x6, x6, x16; \ - adds x7, x7, x5; \ - adcs x6, x6, x16; \ - adc x5, xzr, xzr; \ - subs x17, x17, x7; \ - sbcs x12, x12, x6; \ - sbcs x13, x13, x5; \ - sbcs x14, x14, xzr; \ - sbcs x15, x15, xzr; \ - sbc x16, x16, xzr; \ - lsl x7, x17, #32; \ - add x17, x7, x17; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x17; \ - mov x6, #0xffffffff; \ - mul x5, x6, x17; \ - umulh x6, x6, x17; \ - adds x7, x7, x5; \ - adcs x6, x6, x17; \ - adc x5, xzr, xzr; \ - subs x12, x12, 
x7; \ - sbcs x13, x13, x6; \ - sbcs x14, x14, x5; \ - sbcs x15, x15, xzr; \ - sbcs x16, x16, xzr; \ - sbc x17, x17, xzr; \ - adds x12, x12, x19; \ - adcs x13, x13, x20; \ - adcs x14, x14, x21; \ - adcs x15, x15, x22; \ - adcs x16, x16, x2; \ - adcs x17, x17, x1; \ - adc x10, xzr, xzr; \ - mov x11, #0xffffffff00000001; \ - adds x19, x12, x11; \ - mov x11, #0xffffffff; \ - adcs x20, x13, x11; \ - mov x11, #0x1; \ - adcs x21, x14, x11; \ - adcs x22, x15, xzr; \ - adcs x2, x16, xzr; \ - adcs x1, x17, xzr; \ - adcs x10, x10, xzr; \ - csel x12, x12, x19, eq; \ - csel x13, x13, x20, eq; \ - csel x14, x14, x21, eq; \ - csel x15, x15, x22, eq; \ - csel x16, x16, x2, eq; \ - csel x17, x17, x1, eq; \ - stp x12, x13, [P0]; \ - stp x14, x15, [P0+16]; \ - stp x16, x17, [P0+32] - -// Corresponds exactly to bignum_montsqr_p384_alt - -#define montsqr_p384(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x9, x2, x3; \ - umulh x10, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x8, x2, x4; \ - adds x10, x10, x8; \ - mul x11, x2, x5; \ - mul x8, x3, x4; \ - adcs x11, x11, x8; \ - umulh x12, x2, x5; \ - mul x8, x3, x5; \ - adcs x12, x12, x8; \ - ldp x6, x7, [P1+32]; \ - mul x13, x2, x7; \ - mul x8, x3, x6; \ - adcs x13, x13, x8; \ - umulh x14, x2, x7; \ - mul x8, x3, x7; \ - adcs x14, x14, x8; \ - mul x15, x5, x6; \ - adcs x15, x15, xzr; \ - umulh x16, x5, x6; \ - adc x16, x16, xzr; \ - umulh x8, x2, x4; \ - adds x11, x11, x8; \ - umulh x8, x3, x4; \ - adcs x12, x12, x8; \ - umulh x8, x3, x5; \ - adcs x13, x13, x8; \ - umulh x8, x3, x6; \ - adcs x14, x14, x8; \ - umulh x8, x3, x7; \ - adcs x15, x15, x8; \ - adc x16, x16, xzr; \ - mul x8, x2, x6; \ - adds x12, x12, x8; \ - mul x8, x4, x5; \ - adcs x13, x13, x8; \ - mul x8, x4, x6; \ - adcs x14, x14, x8; \ - mul x8, x4, x7; \ - adcs x15, x15, x8; \ - mul x8, x5, x7; \ - adcs x16, x16, x8; \ - mul x17, x6, x7; \ - adcs x17, x17, xzr; \ - umulh x19, x6, x7; \ - adc x19, x19, xzr; \ - umulh x8, x2, x6; \ - adds x13, x13, x8; \ - umulh x8, x4, x5; \ - adcs x14, x14, x8; \ - umulh x8, x4, x6; \ - adcs x15, x15, x8; \ - umulh x8, x4, x7; \ - adcs x16, x16, x8; \ - umulh x8, x5, x7; \ - adcs x17, x17, x8; \ - adc x19, x19, xzr; \ - adds x9, x9, x9; \ - adcs x10, x10, x10; \ - adcs x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - cset x20, hs; \ - umulh x8, x2, x2; \ - mul x2, x2, x2; \ - adds x9, x9, x8; \ - mul x8, x3, x3; \ - adcs x10, x10, x8; \ - umulh x8, x3, x3; \ - adcs x11, x11, x8; \ - mul x8, x4, x4; \ - adcs x12, x12, x8; \ - umulh x8, x4, x4; \ - adcs x13, x13, x8; \ - mul x8, x5, x5; \ - adcs x14, x14, x8; \ - umulh x8, x5, x5; \ - adcs x15, x15, x8; \ - mul x8, x6, x6; \ - adcs x16, x16, x8; \ - umulh x8, x6, x6; \ - adcs x17, x17, x8; \ - mul x8, x7, x7; \ - adcs x19, x19, x8; \ - umulh x8, x7, x7; \ - adc x20, x20, x8; \ - lsl x5, x2, #32; \ - add x2, x5, x2; \ - mov x5, #-4294967295; \ - umulh x5, x5, x2; \ - mov x4, #4294967295; \ - mul x3, x4, x2; \ - umulh x4, x4, x2; \ - adds x5, x5, x3; \ - adcs x4, x4, x2; \ - adc x3, xzr, xzr; \ - subs x9, x9, x5; \ - sbcs x10, x10, x4; \ - sbcs x11, x11, x3; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbc x2, x2, xzr; \ - lsl x5, x9, #32; \ - add x9, x5, x9; \ - mov x5, #-4294967295; \ - umulh x5, x5, x9; \ - mov x4, #4294967295; \ - mul x3, x4, x9; \ - umulh x4, x4, x9; \ - adds x5, x5, x3; \ - adcs x4, x4, x9; \ - adc x3, xzr, xzr; \ - subs x10, x10, x5; \ - sbcs x11, x11, x4; \ - 
sbcs x12, x12, x3; \ - sbcs x13, x13, xzr; \ - sbcs x2, x2, xzr; \ - sbc x9, x9, xzr; \ - lsl x5, x10, #32; \ - add x10, x5, x10; \ - mov x5, #-4294967295; \ - umulh x5, x5, x10; \ - mov x4, #4294967295; \ - mul x3, x4, x10; \ - umulh x4, x4, x10; \ - adds x5, x5, x3; \ - adcs x4, x4, x10; \ - adc x3, xzr, xzr; \ - subs x11, x11, x5; \ - sbcs x12, x12, x4; \ - sbcs x13, x13, x3; \ - sbcs x2, x2, xzr; \ - sbcs x9, x9, xzr; \ - sbc x10, x10, xzr; \ - lsl x5, x11, #32; \ - add x11, x5, x11; \ - mov x5, #-4294967295; \ - umulh x5, x5, x11; \ - mov x4, #4294967295; \ - mul x3, x4, x11; \ - umulh x4, x4, x11; \ - adds x5, x5, x3; \ - adcs x4, x4, x11; \ - adc x3, xzr, xzr; \ - subs x12, x12, x5; \ - sbcs x13, x13, x4; \ - sbcs x2, x2, x3; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbc x11, x11, xzr; \ - lsl x5, x12, #32; \ - add x12, x5, x12; \ - mov x5, #-4294967295; \ - umulh x5, x5, x12; \ - mov x4, #4294967295; \ - mul x3, x4, x12; \ - umulh x4, x4, x12; \ - adds x5, x5, x3; \ - adcs x4, x4, x12; \ - adc x3, xzr, xzr; \ - subs x13, x13, x5; \ - sbcs x2, x2, x4; \ - sbcs x9, x9, x3; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbc x12, x12, xzr; \ - lsl x5, x13, #32; \ - add x13, x5, x13; \ - mov x5, #-4294967295; \ - umulh x5, x5, x13; \ - mov x4, #4294967295; \ - mul x3, x4, x13; \ - umulh x4, x4, x13; \ - adds x5, x5, x3; \ - adcs x4, x4, x13; \ - adc x3, xzr, xzr; \ - subs x2, x2, x5; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, x17; \ - adcs x12, x12, x19; \ - adcs x13, x13, x20; \ - adc x6, xzr, xzr; \ - mov x8, #-4294967295; \ - adds x14, x2, x8; \ - mov x8, #4294967295; \ - adcs x15, x9, x8; \ - mov x8, #1; \ - adcs x16, x10, x8; \ - adcs x17, x11, xzr; \ - adcs x19, x12, xzr; \ - adcs x20, x13, xzr; \ - adcs x6, x6, xzr; \ - csel x2, x2, x14, eq; \ - csel x9, x9, x15, eq; \ - csel x10, x10, x16, eq; \ - csel x11, x11, x17, eq; \ - csel x12, x12, x19, eq; \ - csel x13, x13, x20, eq; \ - stp x2, x9, [P0]; \ - stp x10, x11, [P0+16]; \ - stp x12, x13, [P0+32] - -// Almost-Montgomery variant which we use when an input to other muls -// with the other argument fully reduced (which is always safe). In -// fact, with the Karatsuba-based Montgomery mul here, we don't even -// *need* the restriction that the other argument is reduced. 
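Concretely (restating the convention, not quoting the file), both squaring macros return a six-limb value z with

    z = x^2 * 2^(-384) (mod p_384),   0 <= z < p_384    (montsqr_p384, for reduced x)
    z = x^2 * 2^(-384) (mod p_384),   0 <= z < 2^384    (amontsqr_p384)

montsqr_p384 finishes by trial-adding 2^384 - p_384 and selecting, which amounts to a compare against p_384 plus a conditional subtraction, so its output is fully reduced. The amontsqr_p384 variant defined just below looks only at the final carry c, adds c * (2^384 - p_384) and drops c * 2^384, i.e. it subtracts p_384 exactly when the six-limb sum overflowed. Its output therefore always fits in six limbs but may still be at or above p_384, which is the "almost-Montgomery" property the comment above refers to, and which is harmless when the value only feeds further Montgomery multiplications.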
- -#define amontsqr_p384(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x9, x2, x3; \ - umulh x10, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x8, x2, x4; \ - adds x10, x10, x8; \ - mul x11, x2, x5; \ - mul x8, x3, x4; \ - adcs x11, x11, x8; \ - umulh x12, x2, x5; \ - mul x8, x3, x5; \ - adcs x12, x12, x8; \ - ldp x6, x7, [P1+32]; \ - mul x13, x2, x7; \ - mul x8, x3, x6; \ - adcs x13, x13, x8; \ - umulh x14, x2, x7; \ - mul x8, x3, x7; \ - adcs x14, x14, x8; \ - mul x15, x5, x6; \ - adcs x15, x15, xzr; \ - umulh x16, x5, x6; \ - adc x16, x16, xzr; \ - umulh x8, x2, x4; \ - adds x11, x11, x8; \ - umulh x8, x3, x4; \ - adcs x12, x12, x8; \ - umulh x8, x3, x5; \ - adcs x13, x13, x8; \ - umulh x8, x3, x6; \ - adcs x14, x14, x8; \ - umulh x8, x3, x7; \ - adcs x15, x15, x8; \ - adc x16, x16, xzr; \ - mul x8, x2, x6; \ - adds x12, x12, x8; \ - mul x8, x4, x5; \ - adcs x13, x13, x8; \ - mul x8, x4, x6; \ - adcs x14, x14, x8; \ - mul x8, x4, x7; \ - adcs x15, x15, x8; \ - mul x8, x5, x7; \ - adcs x16, x16, x8; \ - mul x17, x6, x7; \ - adcs x17, x17, xzr; \ - umulh x19, x6, x7; \ - adc x19, x19, xzr; \ - umulh x8, x2, x6; \ - adds x13, x13, x8; \ - umulh x8, x4, x5; \ - adcs x14, x14, x8; \ - umulh x8, x4, x6; \ - adcs x15, x15, x8; \ - umulh x8, x4, x7; \ - adcs x16, x16, x8; \ - umulh x8, x5, x7; \ - adcs x17, x17, x8; \ - adc x19, x19, xzr; \ - adds x9, x9, x9; \ - adcs x10, x10, x10; \ - adcs x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - cset x20, hs; \ - umulh x8, x2, x2; \ - mul x2, x2, x2; \ - adds x9, x9, x8; \ - mul x8, x3, x3; \ - adcs x10, x10, x8; \ - umulh x8, x3, x3; \ - adcs x11, x11, x8; \ - mul x8, x4, x4; \ - adcs x12, x12, x8; \ - umulh x8, x4, x4; \ - adcs x13, x13, x8; \ - mul x8, x5, x5; \ - adcs x14, x14, x8; \ - umulh x8, x5, x5; \ - adcs x15, x15, x8; \ - mul x8, x6, x6; \ - adcs x16, x16, x8; \ - umulh x8, x6, x6; \ - adcs x17, x17, x8; \ - mul x8, x7, x7; \ - adcs x19, x19, x8; \ - umulh x8, x7, x7; \ - adc x20, x20, x8; \ - lsl x5, x2, #32; \ - add x2, x5, x2; \ - mov x5, #-4294967295; \ - umulh x5, x5, x2; \ - mov x4, #4294967295; \ - mul x3, x4, x2; \ - umulh x4, x4, x2; \ - adds x5, x5, x3; \ - adcs x4, x4, x2; \ - adc x3, xzr, xzr; \ - subs x9, x9, x5; \ - sbcs x10, x10, x4; \ - sbcs x11, x11, x3; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbc x2, x2, xzr; \ - lsl x5, x9, #32; \ - add x9, x5, x9; \ - mov x5, #-4294967295; \ - umulh x5, x5, x9; \ - mov x4, #4294967295; \ - mul x3, x4, x9; \ - umulh x4, x4, x9; \ - adds x5, x5, x3; \ - adcs x4, x4, x9; \ - adc x3, xzr, xzr; \ - subs x10, x10, x5; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ - sbcs x13, x13, xzr; \ - sbcs x2, x2, xzr; \ - sbc x9, x9, xzr; \ - lsl x5, x10, #32; \ - add x10, x5, x10; \ - mov x5, #-4294967295; \ - umulh x5, x5, x10; \ - mov x4, #4294967295; \ - mul x3, x4, x10; \ - umulh x4, x4, x10; \ - adds x5, x5, x3; \ - adcs x4, x4, x10; \ - adc x3, xzr, xzr; \ - subs x11, x11, x5; \ - sbcs x12, x12, x4; \ - sbcs x13, x13, x3; \ - sbcs x2, x2, xzr; \ - sbcs x9, x9, xzr; \ - sbc x10, x10, xzr; \ - lsl x5, x11, #32; \ - add x11, x5, x11; \ - mov x5, #-4294967295; \ - umulh x5, x5, x11; \ - mov x4, #4294967295; \ - mul x3, x4, x11; \ - umulh x4, x4, x11; \ - adds x5, x5, x3; \ - adcs x4, x4, x11; \ - adc x3, xzr, xzr; \ - subs x12, x12, x5; \ - sbcs x13, x13, x4; \ - sbcs x2, x2, x3; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbc x11, x11, xzr; \ - lsl x5, x12, 
#32; \ - add x12, x5, x12; \ - mov x5, #-4294967295; \ - umulh x5, x5, x12; \ - mov x4, #4294967295; \ - mul x3, x4, x12; \ - umulh x4, x4, x12; \ - adds x5, x5, x3; \ - adcs x4, x4, x12; \ - adc x3, xzr, xzr; \ - subs x13, x13, x5; \ - sbcs x2, x2, x4; \ - sbcs x9, x9, x3; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbc x12, x12, xzr; \ - lsl x5, x13, #32; \ - add x13, x5, x13; \ - mov x5, #-4294967295; \ - umulh x5, x5, x13; \ - mov x4, #4294967295; \ - mul x3, x4, x13; \ - umulh x4, x4, x13; \ - adds x5, x5, x3; \ - adcs x4, x4, x13; \ - adc x3, xzr, xzr; \ - subs x2, x2, x5; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, x17; \ - adcs x12, x12, x19; \ - adcs x13, x13, x20; \ - mov x14, #-4294967295; \ - mov x15, #4294967295; \ - csel x14, x14, xzr, cs; \ - csel x15, x15, xzr, cs; \ - cset x16, cs; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, xzr; \ - adcs x12, x12, xzr; \ - adc x13, x13, xzr; \ - stp x2, x9, [P0]; \ - stp x10, x11, [P0+16]; \ - stp x12, x13, [P0+32] - -// Corresponds exactly to bignum_sub_p384 - -#define sub_p384(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - csetm x3, lo; \ - mov x4, #4294967295; \ - and x4, x4, x3; \ - adds x5, x5, x4; \ - eor x4, x4, x3; \ - adcs x6, x6, x4; \ - mov x4, #-2; \ - and x4, x4, x3; \ - adcs x7, x7, x4; \ - adcs x8, x8, x3; \ - adcs x9, x9, x3; \ - adc x10, x10, x3; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32] - -S2N_BN_SYMBOL(p384_montjmixadd_alt): - -// Save regs and make room on stack for temporary variables - - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! - stp x23, x24, [sp, #-16]! - stp x25, x26, [sp, #-16]! - sub sp, sp, NSPACE - -// Move the input arguments to stable places - - mov input_z, x0 - mov input_x, x1 - mov input_y, x2 - -// Main code, just a sequence of basic field operations -// 8 * multiply + 3 * square + 7 * subtract - - amontsqr_p384(zp2,z_1) - montmul_p384(y2a,z_1,y_2) - - montmul_p384(x2a,zp2,x_2) - montmul_p384(y2a,zp2,y2a) - - sub_p384(xd,x2a,x_1) - sub_p384(yd,y2a,y_1) - - amontsqr_p384(zz,xd) - montsqr_p384(ww,yd) - - montmul_p384(zzx1,zz,x_1) - montmul_p384(zzx2,zz,x2a) - - sub_p384(resx,ww,zzx1) - sub_p384(t1,zzx2,zzx1) - - montmul_p384(resz,xd,z_1) - - sub_p384(resx,resx,zzx2) - - sub_p384(t2,zzx1,resx) - - montmul_p384(t1,t1,y_1) - montmul_p384(t2,yd,t2) - - sub_p384(resy,t2,t1) - -// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) - - ldp x0, x1, [z_1] - ldp x2, x3, [z_1+16] - ldp x4, x5, [z_1+32] - orr x6, x0, x1 - orr x7, x2, x3 - orr x8, x4, x5 - orr x6, x6, x7 - orr x6, x6, x8 - cmp x6, xzr - -// Multiplex: if p1 <> 0 just copy the computed result from the staging area. -// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in -// Montgomery form so not the simple constant 1 but rather 2^384 - p_384), -// hence giving 0 + p2 = p2 for the final result. 
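As a quick check of the claim above: the z coordinate stored for the p1 = 0 case (limbs 0xffffffff00000001, 0x00000000ffffffff, 1, 0, 0, 0) is indeed 2^384 - p_384, i.e. 1 in Montgomery form, since adding p_384 to it gives exactly 2^384. A standalone verification, illustrative only and not from the tree:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      static const uint64_t p[6] = {      /* limbs of p_384 */
          0x00000000ffffffffULL, 0xffffffff00000000ULL, 0xfffffffffffffffeULL,
          0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL};
      static const uint64_t mont_one[6] = {  /* constants stored for z_3 above */
          0xffffffff00000001ULL, 0x00000000ffffffffULL, 1, 0, 0, 0};

      /* p_384 + (2^384 - p_384) must come out to exactly 2^384:
       * every limb of the sum is zero and the final carry is 1. */
      uint64_t carry = 0;
      for (int i = 0; i < 6; i++) {
        uint64_t t = p[i] + mont_one[i];
        uint64_t c1 = t < p[i];
        uint64_t s = t + carry;
        carry = c1 | (s < t);
        assert(s == 0);
      }
      assert(carry == 1);
      printf("z = 1 constant is 2^384 - p_384\n");
      return 0;
    }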
- - ldp x0, x1, [resx] - ldp x19, x20, [x_2] - csel x0, x0, x19, ne - csel x1, x1, x20, ne - ldp x2, x3, [resx+16] - ldp x19, x20, [x_2+16] - csel x2, x2, x19, ne - csel x3, x3, x20, ne - ldp x4, x5, [resx+32] - ldp x19, x20, [x_2+32] - csel x4, x4, x19, ne - csel x5, x5, x20, ne - - ldp x6, x7, [resy] - ldp x19, x20, [y_2] - csel x6, x6, x19, ne - csel x7, x7, x20, ne - ldp x8, x9, [resy+16] - ldp x19, x20, [y_2+16] - csel x8, x8, x19, ne - csel x9, x9, x20, ne - ldp x10, x11, [resy+32] - ldp x19, x20, [y_2+32] - csel x10, x10, x19, ne - csel x11, x11, x20, ne - - ldp x12, x13, [resz] - mov x19, #0xffffffff00000001 - mov x20, #0x00000000ffffffff - csel x12, x12, x19, ne - csel x13, x13, x20, ne - ldp x14, x15, [resz+16] - mov x19, #1 - csel x14, x14, x19, ne - csel x15, x15, xzr, ne - ldp x16, x17, [resz+32] - csel x16, x16, xzr, ne - csel x17, x17, xzr, ne - - stp x0, x1, [x_3] - stp x2, x3, [x_3+16] - stp x4, x5, [x_3+32] - stp x6, x7, [y_3] - stp x8, x9, [y_3+16] - stp x10, x11, [y_3+32] - stp x12, x13, [z_3] - stp x14, x15, [z_3+16] - stp x16, x17, [z_3+32] - -// Restore stack and registers - - add sp, sp, NSPACE - - ldp x25, x26, [sp], 16 - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits -#endif diff --git a/third_party/s2n-bignum/arm/p384/p384_montjscalarmul.S b/third_party/s2n-bignum/arm/p384/p384_montjscalarmul.S deleted file mode 100644 index 2bd405e2454..00000000000 --- a/third_party/s2n-bignum/arm/p384/p384_montjscalarmul.S +++ /dev/null @@ -1,9988 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Montgomery-Jacobian form scalar multiplication for P-384 -// Input scalar[6], point[18]; output res[18] -// -// extern void p384_montjscalarmul -// (uint64_t res[static 18], -// uint64_t scalar[static 6], -// uint64_t point[static 18]); -// -// This function is a variant of its affine point version p384_scalarmul. -// Here, input and output points are assumed to be in Jacobian form with -// their coordinates in the Montgomery domain. Thus, if priming indicates -// Montgomery form, x' = (2^384 * x) mod p_384 etc., each point argument -// is a triple (x',y',z') representing the affine point (x/z^2,y/z^3) when -// z' is nonzero or the point at infinity (group identity) if z' = 0. -// -// Given scalar = n and point = P, assumed to be on the NIST elliptic -// curve P-384, returns a representation of n * P. If the result is the -// point at infinity (either because the input point was or because the -// scalar was a multiple of p_384) then the output is guaranteed to -// represent the point at infinity, i.e. to have its z coordinate zero. -// -// Standard ARM ABI: X0 = res, X1 = scalar, X2 = point -// ---------------------------------------------------------------------------- - -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjscalarmul) - S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjscalarmul) - - .text - .balign 4 - -// Size of individual field elements - -#define NUMSIZE 48 -#define JACSIZE (3*NUMSIZE) - -// Safe copies of input res and additional values in variables. - -#define bf x22 -#define sgn x23 -#define j x24 -#define res x25 - -// Intermediate variables on the stack. 
-// The table is 16 entries, each of size JACSIZE = 3 * NUMSIZE - -#define scalarb sp, #(0*NUMSIZE) -#define acc sp, #(1*NUMSIZE) -#define tabent sp, #(4*NUMSIZE) - -#define tab sp, #(7*NUMSIZE) - -#define NSPACE #(55*NUMSIZE) - -// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator, -// which doesn't accept repetitions, assembler macros etc. - -#define selectblock(I) \ - cmp bf, #(1*I); \ - ldp x20, x21, [x19]; \ - csel x0, x20, x0, eq; \ - csel x1, x21, x1, eq; \ - ldp x20, x21, [x19, #16]; \ - csel x2, x20, x2, eq; \ - csel x3, x21, x3, eq; \ - ldp x20, x21, [x19, #32]; \ - csel x4, x20, x4, eq; \ - csel x5, x21, x5, eq; \ - ldp x20, x21, [x19, #48]; \ - csel x6, x20, x6, eq; \ - csel x7, x21, x7, eq; \ - ldp x20, x21, [x19, #64]; \ - csel x8, x20, x8, eq; \ - csel x9, x21, x9, eq; \ - ldp x20, x21, [x19, #80]; \ - csel x10, x20, x10, eq; \ - csel x11, x21, x11, eq; \ - ldp x20, x21, [x19, #96]; \ - csel x12, x20, x12, eq; \ - csel x13, x21, x13, eq; \ - ldp x20, x21, [x19, #112]; \ - csel x14, x20, x14, eq; \ - csel x15, x21, x15, eq; \ - ldp x20, x21, [x19, #128]; \ - csel x16, x20, x16, eq; \ - csel x17, x21, x17, eq; \ - add x19, x19, #JACSIZE - -// Loading large constants - -#define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ - movk nn, n3, lsl #48 - -S2N_BN_SYMBOL(p384_montjscalarmul): - - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! - stp x23, x24, [sp, #-16]! - stp x25, x30, [sp, #-16]! - sub sp, sp, NSPACE - -// Preserve the "res" input argument; others get processed early. - - mov res, x0 - -// Reduce the input scalar mod n_384, i.e. conditionally subtract n_384. -// Store it to "scalarb". - - ldp x3, x4, [x1] - movbig(x15, #0xecec, #0x196a, #0xccc5, #0x2973) - ldp x5, x6, [x1, #16] - movbig(x16, #0x581a, #0x0db2, #0x48b0, #0xa77a) - ldp x7, x8, [x1, #32] - movbig(x17, #0xc763, #0x4d81, #0xf437, #0x2ddf) - - subs x9, x3, x15 - sbcs x10, x4, x16 - sbcs x11, x5, x17 - adcs x12, x6, xzr - adcs x13, x7, xzr - adcs x14, x8, xzr - - csel x3, x3, x9, cc - csel x4, x4, x10, cc - csel x5, x5, x11, cc - csel x6, x6, x12, cc - csel x7, x7, x13, cc - csel x8, x8, x14, cc - - stp x3, x4, [scalarb] - stp x5, x6, [scalarb+16] - stp x7, x8, [scalarb+32] - -// Set the tab[0] table entry to the input point = 1 * P - - ldp x10, x11, [x2] - stp x10, x11, [tab] - ldp x12, x13, [x2, #16] - stp x12, x13, [tab+16] - ldp x14, x15, [x2, #32] - stp x14, x15, [tab+32] - - ldp x10, x11, [x2, #48] - stp x10, x11, [tab+48] - ldp x12, x13, [x2, #64] - stp x12, x13, [tab+64] - ldp x14, x15, [x2, #80] - stp x14, x15, [tab+80] - - ldp x10, x11, [x2, #96] - stp x10, x11, [tab+96] - ldp x12, x13, [x2, #112] - stp x12, x13, [tab+112] - ldp x14, x15, [x2, #128] - stp x14, x15, [tab+128] - -// Compute and record tab[1] = 2 * p, ..., tab[15] = 16 * P - - add x0, tab+JACSIZE*1 - add x1, tab - bl p384_montjscalarmul_p384_montjdouble - - add x0, tab+JACSIZE*2 - add x1, tab+JACSIZE*1 - add x2, tab - bl p384_montjscalarmul_p384_montjadd - - add x0, tab+JACSIZE*3 - add x1, tab+JACSIZE*1 - bl p384_montjscalarmul_p384_montjdouble - - add x0, tab+JACSIZE*4 - add x1, tab+JACSIZE*3 - add x2, tab - bl p384_montjscalarmul_p384_montjadd - - add x0, tab+JACSIZE*5 - add x1, tab+JACSIZE*2 - bl p384_montjscalarmul_p384_montjdouble - - add x0, tab+JACSIZE*6 - add x1, tab+JACSIZE*5 - add x2, tab - bl p384_montjscalarmul_p384_montjadd - - add x0, tab+JACSIZE*7 - add x1, tab+JACSIZE*3 - bl p384_montjscalarmul_p384_montjdouble - - add x0, tab+JACSIZE*8 - add x1, 
tab+JACSIZE*7 - add x2, tab - bl p384_montjscalarmul_p384_montjadd - - add x0, tab+JACSIZE*9 - add x1, tab+JACSIZE*4 - bl p384_montjscalarmul_p384_montjdouble - - add x0, tab+JACSIZE*10 - add x1, tab+JACSIZE*9 - add x2, tab - bl p384_montjscalarmul_p384_montjadd - - add x0, tab+JACSIZE*11 - add x1, tab+JACSIZE*5 - bl p384_montjscalarmul_p384_montjdouble - - add x0, tab+JACSIZE*12 - add x1, tab+JACSIZE*11 - add x2, tab - bl p384_montjscalarmul_p384_montjadd - - add x0, tab+JACSIZE*13 - add x1, tab+JACSIZE*6 - bl p384_montjscalarmul_p384_montjdouble - - add x0, tab+JACSIZE*14 - add x1, tab+JACSIZE*13 - add x2, tab - bl p384_montjscalarmul_p384_montjadd - - add x0, tab+JACSIZE*15 - add x1, tab+JACSIZE*7 - bl p384_montjscalarmul_p384_montjdouble - -// Add the recoding constant sum_i(16 * 32^i) to the scalar to allow signed -// digits. The digits of the constant, in lowest-to-highest order, are as -// follows; they are generated dynamically since none is a simple ARM load. -// -// 0x0842108421084210 -// 0x1084210842108421 -// 0x2108421084210842 -// 0x4210842108421084 -// 0x8421084210842108 -// 0x0842108421084210 - - ldp x0, x1, [scalarb] - ldp x2, x3, [scalarb+16] - ldp x4, x5, [scalarb+32] - movbig(x8, #0x1084, #0x2108, #0x4210, #0x8421) - adds x0, x0, x8, lsr #1 - adcs x1, x1, x8 - lsl x8, x8, #1 - adcs x2, x2, x8 - lsl x8, x8, #1 - adcs x3, x3, x8 - lsl x8, x8, #1 - adcs x4, x4, x8 - lsr x8, x8, #4 - adcs x5, x5, x8 - cset x6, cs - -// Record the top bitfield then shift the whole scalar left 4 bits -// to align the top of the next bitfield with the MSB (bits 379..383). - - extr bf, x6, x5, #60 - extr x5, x5, x4, #60 - extr x4, x4, x3, #60 - extr x3, x3, x2, #60 - extr x2, x2, x1, #60 - extr x1, x1, x0, #60 - lsl x0, x0, #4 - stp x0, x1, [scalarb] - stp x2, x3, [scalarb+16] - stp x4, x5, [scalarb+32] - -// Initialize the accumulator to the corresponding entry using constant-time -// lookup in the table. This top digit, uniquely, is not recoded so there is -// no sign adjustment to make. - - mov x0, xzr - mov x1, xzr - mov x2, xzr - mov x3, xzr - mov x4, xzr - mov x5, xzr - mov x6, xzr - mov x7, xzr - mov x8, xzr - mov x9, xzr - mov x10, xzr - mov x11, xzr - mov x12, xzr - mov x13, xzr - mov x14, xzr - mov x15, xzr - mov x16, xzr - mov x17, xzr - - add x19, tab - - selectblock(1) - selectblock(2) - selectblock(3) - selectblock(4) - selectblock(5) - selectblock(6) - selectblock(7) - selectblock(8) - selectblock(9) - selectblock(10) - selectblock(11) - selectblock(12) - selectblock(13) - selectblock(14) - selectblock(15) - selectblock(16) - - stp x0, x1, [acc] - stp x2, x3, [acc+16] - stp x4, x5, [acc+32] - stp x6, x7, [acc+48] - stp x8, x9, [acc+64] - stp x10, x11, [acc+80] - stp x12, x13, [acc+96] - stp x14, x15, [acc+112] - stp x16, x17, [acc+128] - - mov j, #380 - -// Main loop over size-5 bitfields: double 5 times then add signed digit -// At each stage we shift the scalar left by 5 bits so we can simply pick -// the top 5 bits as the bitfield, saving some fiddle over indexing. 
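The recoding described above (add 16 to every 5-bit window, then treat each raw window value w as the signed digit w - 16, kept as sign plus magnitude) can be demonstrated on a scaled-down scalar. The sketch below uses 12 windows of a 60-bit value instead of the 384-bit case, and the names are mine; it only illustrates the identity that the signed digits plus the unsigned top bitfield reproduce the original scalar:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      const uint64_t n = 0x0123456789abcdeULL;  /* any scalar below 2^60 */
      /* Recoding constant: 16 in every 5-bit window, i.e. sum_i 16 * 32^i. */
      uint64_t C = 0;
      for (int i = 0; i < 12; i++) C += 16ULL << (5 * i);

      uint64_t t = n + C;                  /* may carry into bit 60: top bitfield */
      int64_t acc = 0;
      for (int i = 0; i < 12; i++) {
        int64_t w = (int64_t)((t >> (5 * i)) & 31); /* raw base-32 digit of n + C */
        int64_t digit = w - 16;                     /* signed digit in [-16, 15]  */
        int sgn = digit < 0;                        /* like "cset sgn, lo"        */
        int64_t mag = sgn ? -digit : digit;         /* like "cneg bf, bf, lo"     */
        acc += (sgn ? -mag : mag) * ((int64_t)1 << (5 * i));
      }
      acc += (int64_t)(t >> 60) * ((int64_t)1 << 60); /* unsigned top bitfield    */
      assert(acc == (int64_t)n);
      printf("signed-digit recoding reproduces the scalar\n");
      return 0;
    }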
- -p384_montjscalarmul_mainloop: - sub j, j, #5 - - add x0, acc - add x1, acc - bl p384_montjscalarmul_p384_montjdouble - - add x0, acc - add x1, acc - bl p384_montjscalarmul_p384_montjdouble - - add x0, acc - add x1, acc - bl p384_montjscalarmul_p384_montjdouble - - add x0, acc - add x1, acc - bl p384_montjscalarmul_p384_montjdouble - - add x0, acc - add x1, acc - bl p384_montjscalarmul_p384_montjdouble - -// Choose the bitfield and adjust it to sign and magnitude - - ldp x0, x1, [scalarb] - ldp x2, x3, [scalarb+16] - ldp x4, x5, [scalarb+32] - lsr bf, x5, #59 - extr x5, x5, x4, #59 - extr x4, x4, x3, #59 - extr x3, x3, x2, #59 - extr x2, x2, x1, #59 - extr x1, x1, x0, #59 - lsl x0, x0, #5 - stp x0, x1, [scalarb] - stp x2, x3, [scalarb+16] - stp x4, x5, [scalarb+32] - - subs bf, bf, #16 - cset sgn, lo // sgn = sign of digit (1 = negative) - cneg bf, bf, lo // bf = absolute value of digit - -// Conditionally select the table entry tab[i-1] = i * P in constant time - - mov x0, xzr - mov x1, xzr - mov x2, xzr - mov x3, xzr - mov x4, xzr - mov x5, xzr - mov x6, xzr - mov x7, xzr - mov x8, xzr - mov x9, xzr - mov x10, xzr - mov x11, xzr - mov x12, xzr - mov x13, xzr - mov x14, xzr - mov x15, xzr - mov x16, xzr - mov x17, xzr - - add x19, tab - - selectblock(1) - selectblock(2) - selectblock(3) - selectblock(4) - selectblock(5) - selectblock(6) - selectblock(7) - selectblock(8) - selectblock(9) - selectblock(10) - selectblock(11) - selectblock(12) - selectblock(13) - selectblock(14) - selectblock(15) - selectblock(16) - -// Store it to "tabent" with the y coordinate optionally negated. -// This is done carefully to give coordinates < p_384 even in -// the degenerate case y = 0 (when z = 0 for points on the curve). - - stp x0, x1, [tabent] - stp x2, x3, [tabent+16] - stp x4, x5, [tabent+32] - - stp x12, x13, [tabent+96] - stp x14, x15, [tabent+112] - stp x16, x17, [tabent+128] - - mov x0, #0x00000000ffffffff - subs x0, x0, x6 - orr x12, x6, x7 - mov x1, #0xffffffff00000000 - sbcs x1, x1, x7 - orr x13, x8, x9 - mov x2, #0xfffffffffffffffe - sbcs x2, x2, x8 - orr x14, x10, x11 - mov x5, #0xffffffffffffffff - sbcs x3, x5, x9 - orr x12, x12, x13 - sbcs x4, x5, x10 - orr x12, x12, x14 - sbcs x5, x5, x11 - - cmp sgn, xzr - ccmp x12, xzr, #4, ne - - csel x6, x0, x6, ne - csel x7, x1, x7, ne - csel x8, x2, x8, ne - csel x9, x3, x9, ne - csel x10, x4, x10, ne - csel x11, x5, x11, ne - - stp x6, x7, [tabent+48] - stp x8, x9, [tabent+64] - stp x10, x11, [tabent+80] - -// Add to the accumulator - - add x0, acc - add x1, acc - add x2, tabent - bl p384_montjscalarmul_p384_montjadd - - cbnz j, p384_montjscalarmul_mainloop - -// That's the end of the main loop, and we just need to copy the -// result in "acc" to the output. - - ldp x0, x1, [acc] - stp x0, x1, [res] - ldp x0, x1, [acc+16] - stp x0, x1, [res, #16] - ldp x0, x1, [acc+32] - stp x0, x1, [res, #32] - ldp x0, x1, [acc+48] - stp x0, x1, [res, #48] - ldp x0, x1, [acc+64] - stp x0, x1, [res, #64] - ldp x0, x1, [acc+80] - stp x0, x1, [res, #80] - ldp x0, x1, [acc+96] - stp x0, x1, [res, #96] - ldp x0, x1, [acc+112] - stp x0, x1, [res, #112] - ldp x0, x1, [acc+128] - stp x0, x1, [res, #128] - -// Restore stack and registers and return - - add sp, sp, NSPACE - ldp x25, x30, [sp], 16 - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - ret - -// Local copies of subroutines, complete clones at the moment - -p384_montjscalarmul_p384_montjadd: - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! - stp x23, x24, [sp, #-16]! 
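The per-digit lookup in the main loop above selects |digit| * P in constant time and, for a negative digit, negates the point by replacing y with p_384 - y, except that y = 0 is kept as 0 so the stored coordinates stay below p_384. A functional (not constant-time) C model of that negation step, with hypothetical names and illustrative test values:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* If sgn != 0 and y != 0, replace y with p_384 - y; otherwise keep y. */
    static void cond_negate_y(uint64_t y[6], uint64_t sgn) {
      static const uint64_t p[6] = {
          0x00000000ffffffffULL, 0xffffffff00000000ULL, 0xfffffffffffffffeULL,
          0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL};
      uint64_t ny[6], borrow = 0, nonzero = 0;
      for (int i = 0; i < 6; i++) {
        nonzero |= y[i];                   /* orr chain testing y == 0        */
        uint64_t d = p[i] - y[i];          /* p_384 - y (no borrow if y < p)  */
        ny[i] = d - borrow;
        borrow = (p[i] < y[i]) | (d < borrow);
      }
      int negate = (sgn != 0) && (nonzero != 0);  /* like cmp sgn / ccmp / ne  */
      for (int i = 0; i < 6; i++)
        y[i] = negate ? ny[i] : y[i];             /* csel in the assembly      */
    }

    int main(void) {
      uint64_t y[6] = {5, 0, 0, 0, 0, 0};
      cond_negate_y(y, 1);
      assert(y[0] == 0x00000000fffffffaULL);      /* low limb of p_384 - 5     */
      uint64_t z[6] = {0, 0, 0, 0, 0, 0};
      cond_negate_y(z, 1);
      assert(z[0] == 0);                          /* y = 0 stays 0, not p_384  */
      printf("conditional negation OK\n");
      return 0;
    }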
- stp x25, x26, [sp, #-16]! - stp x27, xzr, [sp, #-16]! - sub sp, sp, #0x180 - mov x24, x0 - mov x25, x1 - mov x26, x2 - mov x0, sp - ldr q1, [x25, #96] - ldp x9, x2, [x25, #96] - ldr q0, [x25, #96] - ldp x4, x6, [x25, #112] - rev64 v21.4s, v1.4s - uzp2 v28.4s, v1.4s, v1.4s - umulh x7, x9, x2 - xtn v17.2s, v1.2d - mul v27.4s, v21.4s, v0.4s - ldr q20, [x25, #128] - xtn v30.2s, v0.2d - ldr q1, [x25, #128] - uzp2 v31.4s, v0.4s, v0.4s - ldp x5, x10, [x25, #128] - umulh x8, x9, x4 - uaddlp v3.2d, v27.4s - umull v16.2d, v30.2s, v17.2s - mul x16, x9, x4 - umull v27.2d, v30.2s, v28.2s - shrn v0.2s, v20.2d, #32 - xtn v7.2s, v20.2d - shl v20.2d, v3.2d, #32 - umull v3.2d, v31.2s, v28.2s - mul x3, x2, x4 - umlal v20.2d, v30.2s, v17.2s - umull v22.2d, v7.2s, v0.2s - usra v27.2d, v16.2d, #32 - umulh x11, x2, x4 - movi v21.2d, #0xffffffff - uzp2 v28.4s, v1.4s, v1.4s - adds x15, x16, x7 - and v5.16b, v27.16b, v21.16b - adcs x3, x3, x8 - usra v3.2d, v27.2d, #32 - dup v29.2d, x6 - adcs x16, x11, xzr - mov x14, v20.d[0] - umlal v5.2d, v31.2s, v17.2s - mul x8, x9, x2 - mov x7, v20.d[1] - shl v19.2d, v22.2d, #33 - xtn v25.2s, v29.2d - rev64 v31.4s, v1.4s - lsl x13, x14, #32 - uzp2 v6.4s, v29.4s, v29.4s - umlal v19.2d, v7.2s, v7.2s - usra v3.2d, v5.2d, #32 - adds x1, x8, x8 - umulh x8, x4, x4 - add x12, x13, x14 - mul v17.4s, v31.4s, v29.4s - xtn v4.2s, v1.2d - adcs x14, x15, x15 - lsr x13, x12, #32 - adcs x15, x3, x3 - umull v31.2d, v25.2s, v28.2s - adcs x11, x16, x16 - umull v21.2d, v25.2s, v4.2s - mov x17, v3.d[0] - umull v18.2d, v6.2s, v28.2s - adc x16, x8, xzr - uaddlp v16.2d, v17.4s - movi v1.2d, #0xffffffff - subs x13, x13, x12 - usra v31.2d, v21.2d, #32 - sbc x8, x12, xzr - adds x17, x17, x1 - mul x1, x4, x4 - shl v28.2d, v16.2d, #32 - mov x3, v3.d[1] - adcs x14, x7, x14 - extr x7, x8, x13, #32 - adcs x13, x3, x15 - and v3.16b, v31.16b, v1.16b - adcs x11, x1, x11 - lsr x1, x8, #32 - umlal v3.2d, v6.2s, v4.2s - usra v18.2d, v31.2d, #32 - adc x3, x16, xzr - adds x1, x1, x12 - umlal v28.2d, v25.2s, v4.2s - adc x16, xzr, xzr - subs x15, x17, x7 - sbcs x7, x14, x1 - lsl x1, x15, #32 - sbcs x16, x13, x16 - add x8, x1, x15 - usra v18.2d, v3.2d, #32 - sbcs x14, x11, xzr - lsr x1, x8, #32 - sbcs x17, x3, xzr - sbc x11, x12, xzr - subs x13, x1, x8 - umulh x12, x4, x10 - sbc x1, x8, xzr - extr x13, x1, x13, #32 - lsr x1, x1, #32 - adds x15, x1, x8 - adc x1, xzr, xzr - subs x7, x7, x13 - sbcs x13, x16, x15 - lsl x3, x7, #32 - umulh x16, x2, x5 - sbcs x15, x14, x1 - add x7, x3, x7 - sbcs x3, x17, xzr - lsr x1, x7, #32 - sbcs x14, x11, xzr - sbc x11, x8, xzr - subs x8, x1, x7 - sbc x1, x7, xzr - extr x8, x1, x8, #32 - lsr x1, x1, #32 - adds x1, x1, x7 - adc x17, xzr, xzr - subs x13, x13, x8 - umulh x8, x9, x6 - sbcs x1, x15, x1 - sbcs x15, x3, x17 - sbcs x3, x14, xzr - mul x17, x2, x5 - sbcs x11, x11, xzr - stp x13, x1, [x0] - sbc x14, x7, xzr - mul x7, x4, x10 - subs x1, x9, x2 - stp x15, x3, [x0, #16] - csetm x15, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - stp x11, x14, [x0, #32] - mul x14, x9, x6 - adds x17, x8, x17 - adcs x7, x16, x7 - adc x13, x12, xzr - subs x12, x5, x6 - cneg x3, x12, cc // cc = lo, ul, last - cinv x16, x15, cc // cc = lo, ul, last - mul x8, x1, x3 - umulh x1, x1, x3 - eor x12, x8, x16 - adds x11, x17, x14 - adcs x3, x7, x17 - adcs x15, x13, x7 - adc x8, x13, xzr - adds x3, x3, x14 - adcs x15, x15, x17 - adcs x17, x8, x7 - eor x1, x1, x16 - adc x13, x13, xzr - subs x9, x9, x4 - csetm x8, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x4, x2, x4 - 
cneg x4, x4, cc // cc = lo, ul, last - csetm x7, cc // cc = lo, ul, last - subs x2, x10, x6 - cinv x8, x8, cc // cc = lo, ul, last - cneg x2, x2, cc // cc = lo, ul, last - cmn x16, #0x1 - adcs x11, x11, x12 - mul x12, x9, x2 - adcs x3, x3, x1 - adcs x15, x15, x16 - umulh x9, x9, x2 - adcs x17, x17, x16 - adc x13, x13, x16 - subs x1, x10, x5 - cinv x2, x7, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - eor x9, x9, x8 - cmn x8, #0x1 - eor x7, x12, x8 - mul x12, x4, x1 - adcs x3, x3, x7 - adcs x7, x15, x9 - adcs x15, x17, x8 - ldp x9, x17, [x0, #16] - umulh x4, x4, x1 - adc x8, x13, x8 - cmn x2, #0x1 - eor x1, x12, x2 - adcs x1, x7, x1 - ldp x7, x16, [x0] - eor x12, x4, x2 - adcs x4, x15, x12 - ldp x15, x12, [x0, #32] - adc x8, x8, x2 - adds x13, x14, x14 - umulh x14, x5, x10 - adcs x2, x11, x11 - adcs x3, x3, x3 - adcs x1, x1, x1 - adcs x4, x4, x4 - adcs x11, x8, x8 - adc x8, xzr, xzr - adds x13, x13, x7 - adcs x2, x2, x16 - mul x16, x5, x10 - adcs x3, x3, x9 - adcs x1, x1, x17 - umulh x5, x5, x5 - lsl x9, x13, #32 - add x9, x9, x13 - adcs x4, x4, x15 - mov x13, v28.d[1] - adcs x15, x11, x12 - lsr x7, x9, #32 - adc x11, x8, xzr - subs x7, x7, x9 - umulh x10, x10, x10 - sbc x17, x9, xzr - extr x7, x17, x7, #32 - lsr x17, x17, #32 - adds x17, x17, x9 - adc x12, xzr, xzr - subs x8, x2, x7 - sbcs x17, x3, x17 - lsl x7, x8, #32 - sbcs x2, x1, x12 - add x3, x7, x8 - sbcs x12, x4, xzr - lsr x1, x3, #32 - sbcs x7, x15, xzr - sbc x15, x9, xzr - subs x1, x1, x3 - sbc x4, x3, xzr - lsr x9, x4, #32 - extr x8, x4, x1, #32 - adds x9, x9, x3 - adc x4, xzr, xzr - subs x1, x17, x8 - lsl x17, x1, #32 - sbcs x8, x2, x9 - sbcs x9, x12, x4 - add x17, x17, x1 - mov x1, v18.d[1] - lsr x2, x17, #32 - sbcs x7, x7, xzr - mov x12, v18.d[0] - sbcs x15, x15, xzr - sbc x3, x3, xzr - subs x4, x2, x17 - sbc x2, x17, xzr - adds x12, x13, x12 - adcs x16, x16, x1 - lsr x13, x2, #32 - extr x1, x2, x4, #32 - adc x2, x14, xzr - adds x4, x13, x17 - mul x13, x6, x6 - adc x14, xzr, xzr - subs x1, x8, x1 - sbcs x4, x9, x4 - mov x9, v28.d[0] - sbcs x7, x7, x14 - sbcs x8, x15, xzr - sbcs x3, x3, xzr - sbc x14, x17, xzr - adds x17, x9, x9 - adcs x12, x12, x12 - mov x15, v19.d[0] - adcs x9, x16, x16 - umulh x6, x6, x6 - adcs x16, x2, x2 - adc x2, xzr, xzr - adds x11, x11, x8 - adcs x3, x3, xzr - adcs x14, x14, xzr - adcs x8, xzr, xzr - adds x13, x1, x13 - mov x1, v19.d[1] - adcs x6, x4, x6 - mov x4, #0xffffffff // #4294967295 - adcs x15, x7, x15 - adcs x7, x11, x5 - adcs x1, x3, x1 - adcs x14, x14, x10 - adc x11, x8, xzr - adds x6, x6, x17 - adcs x8, x15, x12 - adcs x3, x7, x9 - adcs x15, x1, x16 - mov x16, #0xffffffff00000001 // #-4294967295 - adcs x14, x14, x2 - mov x2, #0x1 // #1 - adc x17, x11, xzr - cmn x13, x16 - adcs xzr, x6, x4 - adcs xzr, x8, x2 - adcs xzr, x3, xzr - adcs xzr, x15, xzr - adcs xzr, x14, xzr - adc x1, x17, xzr - neg x9, x1 - and x1, x16, x9 - adds x11, x13, x1 - and x13, x4, x9 - adcs x5, x6, x13 - and x1, x2, x9 - adcs x7, x8, x1 - stp x11, x5, [x0] - adcs x11, x3, xzr - adcs x2, x15, xzr - stp x7, x11, [x0, #16] - adc x17, x14, xzr - stp x2, x17, [x0, #32] - ldr q1, [x26, #96] - ldp x9, x2, [x26, #96] - ldr q0, [x26, #96] - ldp x4, x6, [x26, #112] - rev64 v21.4s, v1.4s - uzp2 v28.4s, v1.4s, v1.4s - umulh x7, x9, x2 - xtn v17.2s, v1.2d - mul v27.4s, v21.4s, v0.4s - ldr q20, [x26, #128] - xtn v30.2s, v0.2d - ldr q1, [x26, #128] - uzp2 v31.4s, v0.4s, v0.4s - ldp x5, x10, [x26, #128] - umulh x8, x9, x4 - uaddlp v3.2d, v27.4s - umull v16.2d, v30.2s, v17.2s - mul x16, x9, x4 - umull v27.2d, v30.2s, 
v28.2s - shrn v0.2s, v20.2d, #32 - xtn v7.2s, v20.2d - shl v20.2d, v3.2d, #32 - umull v3.2d, v31.2s, v28.2s - mul x3, x2, x4 - umlal v20.2d, v30.2s, v17.2s - umull v22.2d, v7.2s, v0.2s - usra v27.2d, v16.2d, #32 - umulh x11, x2, x4 - movi v21.2d, #0xffffffff - uzp2 v28.4s, v1.4s, v1.4s - adds x15, x16, x7 - and v5.16b, v27.16b, v21.16b - adcs x3, x3, x8 - usra v3.2d, v27.2d, #32 - dup v29.2d, x6 - adcs x16, x11, xzr - mov x14, v20.d[0] - umlal v5.2d, v31.2s, v17.2s - mul x8, x9, x2 - mov x7, v20.d[1] - shl v19.2d, v22.2d, #33 - xtn v25.2s, v29.2d - rev64 v31.4s, v1.4s - lsl x13, x14, #32 - uzp2 v6.4s, v29.4s, v29.4s - umlal v19.2d, v7.2s, v7.2s - usra v3.2d, v5.2d, #32 - adds x1, x8, x8 - umulh x8, x4, x4 - add x12, x13, x14 - mul v17.4s, v31.4s, v29.4s - xtn v4.2s, v1.2d - adcs x14, x15, x15 - lsr x13, x12, #32 - adcs x15, x3, x3 - umull v31.2d, v25.2s, v28.2s - adcs x11, x16, x16 - umull v21.2d, v25.2s, v4.2s - mov x17, v3.d[0] - umull v18.2d, v6.2s, v28.2s - adc x16, x8, xzr - uaddlp v16.2d, v17.4s - movi v1.2d, #0xffffffff - subs x13, x13, x12 - usra v31.2d, v21.2d, #32 - sbc x8, x12, xzr - adds x17, x17, x1 - mul x1, x4, x4 - shl v28.2d, v16.2d, #32 - mov x3, v3.d[1] - adcs x14, x7, x14 - extr x7, x8, x13, #32 - adcs x13, x3, x15 - and v3.16b, v31.16b, v1.16b - adcs x11, x1, x11 - lsr x1, x8, #32 - umlal v3.2d, v6.2s, v4.2s - usra v18.2d, v31.2d, #32 - adc x3, x16, xzr - adds x1, x1, x12 - umlal v28.2d, v25.2s, v4.2s - adc x16, xzr, xzr - subs x15, x17, x7 - sbcs x7, x14, x1 - lsl x1, x15, #32 - sbcs x16, x13, x16 - add x8, x1, x15 - usra v18.2d, v3.2d, #32 - sbcs x14, x11, xzr - lsr x1, x8, #32 - sbcs x17, x3, xzr - sbc x11, x12, xzr - subs x13, x1, x8 - umulh x12, x4, x10 - sbc x1, x8, xzr - extr x13, x1, x13, #32 - lsr x1, x1, #32 - adds x15, x1, x8 - adc x1, xzr, xzr - subs x7, x7, x13 - sbcs x13, x16, x15 - lsl x3, x7, #32 - umulh x16, x2, x5 - sbcs x15, x14, x1 - add x7, x3, x7 - sbcs x3, x17, xzr - lsr x1, x7, #32 - sbcs x14, x11, xzr - sbc x11, x8, xzr - subs x8, x1, x7 - sbc x1, x7, xzr - extr x8, x1, x8, #32 - lsr x1, x1, #32 - adds x1, x1, x7 - adc x17, xzr, xzr - subs x13, x13, x8 - umulh x8, x9, x6 - sbcs x1, x15, x1 - sbcs x15, x3, x17 - sbcs x3, x14, xzr - mul x17, x2, x5 - sbcs x11, x11, xzr - stp x13, x1, [sp, #240] - sbc x14, x7, xzr - mul x7, x4, x10 - subs x1, x9, x2 - stp x15, x3, [sp, #256] - csetm x15, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - stp x11, x14, [sp, #272] - mul x14, x9, x6 - adds x17, x8, x17 - adcs x7, x16, x7 - adc x13, x12, xzr - subs x12, x5, x6 - cneg x3, x12, cc // cc = lo, ul, last - cinv x16, x15, cc // cc = lo, ul, last - mul x8, x1, x3 - umulh x1, x1, x3 - eor x12, x8, x16 - adds x11, x17, x14 - adcs x3, x7, x17 - adcs x15, x13, x7 - adc x8, x13, xzr - adds x3, x3, x14 - adcs x15, x15, x17 - adcs x17, x8, x7 - eor x1, x1, x16 - adc x13, x13, xzr - subs x9, x9, x4 - csetm x8, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x4, x2, x4 - cneg x4, x4, cc // cc = lo, ul, last - csetm x7, cc // cc = lo, ul, last - subs x2, x10, x6 - cinv x8, x8, cc // cc = lo, ul, last - cneg x2, x2, cc // cc = lo, ul, last - cmn x16, #0x1 - adcs x11, x11, x12 - mul x12, x9, x2 - adcs x3, x3, x1 - adcs x15, x15, x16 - umulh x9, x9, x2 - adcs x17, x17, x16 - adc x13, x13, x16 - subs x1, x10, x5 - cinv x2, x7, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - eor x9, x9, x8 - cmn x8, #0x1 - eor x7, x12, x8 - mul x12, x4, x1 - adcs x3, x3, x7 - adcs x7, x15, x9 - adcs x15, x17, x8 - ldp x9, x17, [sp, #256] 
- umulh x4, x4, x1 - adc x8, x13, x8 - cmn x2, #0x1 - eor x1, x12, x2 - adcs x1, x7, x1 - ldp x7, x16, [sp, #240] - eor x12, x4, x2 - adcs x4, x15, x12 - ldp x15, x12, [sp, #272] - adc x8, x8, x2 - adds x13, x14, x14 - umulh x14, x5, x10 - adcs x2, x11, x11 - adcs x3, x3, x3 - adcs x1, x1, x1 - adcs x4, x4, x4 - adcs x11, x8, x8 - adc x8, xzr, xzr - adds x13, x13, x7 - adcs x2, x2, x16 - mul x16, x5, x10 - adcs x3, x3, x9 - adcs x1, x1, x17 - umulh x5, x5, x5 - lsl x9, x13, #32 - add x9, x9, x13 - adcs x4, x4, x15 - mov x13, v28.d[1] - adcs x15, x11, x12 - lsr x7, x9, #32 - adc x11, x8, xzr - subs x7, x7, x9 - umulh x10, x10, x10 - sbc x17, x9, xzr - extr x7, x17, x7, #32 - lsr x17, x17, #32 - adds x17, x17, x9 - adc x12, xzr, xzr - subs x8, x2, x7 - sbcs x17, x3, x17 - lsl x7, x8, #32 - sbcs x2, x1, x12 - add x3, x7, x8 - sbcs x12, x4, xzr - lsr x1, x3, #32 - sbcs x7, x15, xzr - sbc x15, x9, xzr - subs x1, x1, x3 - sbc x4, x3, xzr - lsr x9, x4, #32 - extr x8, x4, x1, #32 - adds x9, x9, x3 - adc x4, xzr, xzr - subs x1, x17, x8 - lsl x17, x1, #32 - sbcs x8, x2, x9 - sbcs x9, x12, x4 - add x17, x17, x1 - mov x1, v18.d[1] - lsr x2, x17, #32 - sbcs x7, x7, xzr - mov x12, v18.d[0] - sbcs x15, x15, xzr - sbc x3, x3, xzr - subs x4, x2, x17 - sbc x2, x17, xzr - adds x12, x13, x12 - adcs x16, x16, x1 - lsr x13, x2, #32 - extr x1, x2, x4, #32 - adc x2, x14, xzr - adds x4, x13, x17 - mul x13, x6, x6 - adc x14, xzr, xzr - subs x1, x8, x1 - sbcs x4, x9, x4 - mov x9, v28.d[0] - sbcs x7, x7, x14 - sbcs x8, x15, xzr - sbcs x3, x3, xzr - sbc x14, x17, xzr - adds x17, x9, x9 - adcs x12, x12, x12 - mov x15, v19.d[0] - adcs x9, x16, x16 - umulh x6, x6, x6 - adcs x16, x2, x2 - adc x2, xzr, xzr - adds x11, x11, x8 - adcs x3, x3, xzr - adcs x14, x14, xzr - adcs x8, xzr, xzr - adds x13, x1, x13 - mov x1, v19.d[1] - adcs x6, x4, x6 - mov x4, #0xffffffff // #4294967295 - adcs x15, x7, x15 - adcs x7, x11, x5 - adcs x1, x3, x1 - adcs x14, x14, x10 - adc x11, x8, xzr - adds x6, x6, x17 - adcs x8, x15, x12 - adcs x3, x7, x9 - adcs x15, x1, x16 - mov x16, #0xffffffff00000001 // #-4294967295 - adcs x14, x14, x2 - mov x2, #0x1 // #1 - adc x17, x11, xzr - cmn x13, x16 - adcs xzr, x6, x4 - adcs xzr, x8, x2 - adcs xzr, x3, xzr - adcs xzr, x15, xzr - adcs xzr, x14, xzr - adc x1, x17, xzr - neg x9, x1 - and x1, x16, x9 - adds x11, x13, x1 - and x13, x4, x9 - adcs x5, x6, x13 - and x1, x2, x9 - adcs x7, x8, x1 - stp x11, x5, [sp, #240] - adcs x11, x3, xzr - adcs x2, x15, xzr - stp x7, x11, [sp, #256] - adc x17, x14, xzr - stp x2, x17, [sp, #272] - stp x23, x24, [sp, #0x150] - ldr q3, [x26, #96] - ldr q25, [x25, #48] - ldp x13, x23, [x25, #48] - ldp x3, x21, [x26, #96] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [x25, #80] - ldp x8, x24, [x26, #112] - subs x6, x3, x21 - ldr q0, [x26, #128] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [x25, #64] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, 
last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [x25, #80] - add x2, x12, x7 - adc x7, x5, x10 - ldp x5, x10, [x26, #128] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x16, x4, x16 - mov x4, v27.d[0] - sbcs x11, x20, x11 - sbcs x20, x9, x12 - stp x16, x11, [sp, #288] - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #304] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #320] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc 
// cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - ldp x20, x9, [sp, #288] - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #304] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #320] - adds x20, x22, x20 - mul x10, x13, x14 - adcs x11, x11, x9 - eor x9, x8, x21 - adcs x21, x19, x17 - stp x20, x11, [sp, #288] - adcs x12, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - stp x21, x12, [sp, #304] - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #320] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #288] - ldp x21, x12, [sp, #304] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #320] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x21 - eor x1, x22, x9 - adcs x24, x23, x12 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x21 - adcs x15, x17, x12 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - 
sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x14, x24, x11 - stp x22, x5, [sp, #288] - adcs x5, x13, x23 - adcs x21, x8, x23 - stp x14, x5, [sp, #304] - adc x12, x15, x23 - stp x21, x12, [sp, #320] - ldr q3, [x25, #96] - ldr q25, [x26, #48] - ldp x13, x23, [x26, #48] - ldp x3, x21, [x25, #96] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [x26, #80] - ldp x8, x24, [x25, #112] - subs x6, x3, x21 - ldr q0, [x25, #128] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [x26, #64] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [x26, #80] - add x2, x12, x7 - adc x7, x5, x10 - ldp x5, x10, [x25, #128] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, 
#32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x16, x4, x16 - mov x4, v27.d[0] - sbcs x11, x20, x11 - sbcs x20, x9, x12 - stp x16, x11, [sp, #48] - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #64] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #80] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - ldp x20, x9, [sp, #48] - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #64] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #80] - adds x20, x22, x20 - mul x10, x13, x14 - adcs x11, x11, x9 - eor x9, x8, x21 - adcs x21, x19, x17 - stp x20, x11, [sp, #48] - adcs x12, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - stp x21, x12, [sp, #64] - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #80] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, 
cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #48] - ldp x21, x12, [sp, #64] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #80] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x21 - eor x1, x22, x9 - adcs x24, x23, x12 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x21 - adcs x15, x17, x12 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x14, x24, x11 - stp x22, x5, [sp, #48] - adcs x5, x13, x23 - adcs x21, x8, x23 - stp x14, x5, [sp, #64] - adc x12, x15, x23 - stp x21, x12, [sp, #80] - mov x1, sp - ldr q3, [x1] - ldr q25, [x26] - ldp x13, x23, [x26] - ldp x3, x21, [x1] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [x26, #32] - ldp x8, x24, [x1, #16] - subs x6, x3, x21 - ldr q0, [x1, #32] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [x26, #16] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, 
v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [x26, #32] - add x2, x12, x7 - adc x7, x5, x10 - ldp x5, x10, [x1, #32] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x16, x4, x16 - mov x4, v27.d[0] - sbcs x11, x20, x11 - sbcs x20, x9, x12 - stp x16, x11, [sp, #96] - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #112] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #128] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, 
xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - ldp x20, x9, [sp, #96] - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #112] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #128] - adds x20, x22, x20 - mul x10, x13, x14 - adcs x11, x11, x9 - eor x9, x8, x21 - adcs x21, x19, x17 - stp x20, x11, [sp, #96] - adcs x12, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - stp x21, x12, [sp, #112] - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #128] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #96] - ldp x21, x12, [sp, #112] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #128] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x21 - eor x1, x22, x9 - adcs x24, x23, x12 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x21 - adcs x15, x17, x12 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc 
x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x14, x24, x11 - stp x22, x5, [sp, #96] - adcs x5, x13, x23 - adcs x21, x8, x23 - stp x14, x5, [sp, #112] - adc x12, x15, x23 - stp x21, x12, [sp, #128] - ldr q3, [sp, #240] - ldr q25, [x25] - ldp x13, x23, [x25] - ldp x3, x21, [sp, #240] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [x25, #32] - ldp x8, x24, [sp, #256] - subs x6, x3, x21 - ldr q0, [sp, #272] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [x25, #16] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [x25, #32] - add x2, x12, x7 - adc x7, x5, x10 - ldp x5, x10, [sp, #272] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, 
x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x16, x4, x16 - mov x4, v27.d[0] - sbcs x11, x20, x11 - sbcs x20, x9, x12 - stp x16, x11, [sp, #192] - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #208] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #224] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - ldp x20, x9, [sp, #192] - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #208] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #224] - adds x20, x22, x20 - mul x10, x13, x14 - adcs x11, x11, x9 - eor x9, x8, x21 - adcs x21, x19, x17 - stp x20, x11, [sp, #192] - adcs x12, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - stp x21, x12, [sp, #208] - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #224] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - 
csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #192] - ldp x21, x12, [sp, #208] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #224] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x21 - eor x1, x22, x9 - adcs x24, x23, x12 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x21 - adcs x15, x17, x12 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x14, x24, x11 - stp x22, x5, [sp, #192] - adcs x5, x13, x23 - adcs x21, x8, x23 - stp x14, x5, [sp, #208] - adc x12, x15, x23 - stp x21, x12, [sp, #224] - mov x1, sp - ldr q3, [x1] - ldr q25, [sp, #48] - ldp x13, x23, [sp, #48] - ldp x3, x21, [x1] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [sp, #80] - ldp x8, x24, [x1, #16] - subs x6, x3, x21 - ldr q0, [x1, #32] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, 
x7, cc // cc = lo, ul, last - ldp x6, x14, [sp, #64] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [sp, #80] - add x2, x12, x7 - adc x7, x5, x10 - ldp x5, x10, [x1, #32] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x16, x4, x16 - mov x4, v27.d[0] - sbcs x11, x20, x11 - sbcs x20, x9, x12 - stp x16, x11, [sp, #48] - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #64] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #80] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = 
lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - ldp x20, x9, [sp, #48] - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #64] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #80] - adds x20, x22, x20 - mul x10, x13, x14 - adcs x11, x11, x9 - eor x9, x8, x21 - adcs x21, x19, x17 - stp x20, x11, [sp, #48] - adcs x12, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - stp x21, x12, [sp, #64] - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #80] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #48] - ldp x21, x12, [sp, #64] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #80] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x21 - eor x1, x22, x9 - adcs x24, x23, x12 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x21 - adcs x15, x17, x12 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - 
adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x14, x24, x11 - stp x22, x5, [sp, #48] - adcs x5, x13, x23 - adcs x21, x8, x23 - stp x14, x5, [sp, #64] - adc x12, x15, x23 - stp x21, x12, [sp, #80] - ldr q3, [sp, #240] - ldr q25, [sp, #288] - ldp x13, x23, [sp, #288] - ldp x3, x21, [sp, #240] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [sp, #320] - ldp x8, x24, [sp, #256] - subs x6, x3, x21 - ldr q0, [sp, #272] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [sp, #304] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [sp, #320] - 
add x2, x12, x7 - adc x7, x5, x10 - ldp x5, x10, [sp, #272] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x16, x4, x16 - mov x4, v27.d[0] - sbcs x11, x20, x11 - sbcs x20, x9, x12 - stp x16, x11, [sp, #288] - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #304] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #320] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - ldp x20, x9, [sp, #288] - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #304] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #320] - adds x20, x22, x20 - mul x10, x13, x14 - adcs x11, x11, x9 - eor x9, x8, x21 - adcs x21, x19, x17 - stp x20, x11, [sp, #288] - adcs x12, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - stp x21, x12, [sp, #304] - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #320] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = 
lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #288] - ldp x21, x12, [sp, #304] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #320] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x21 - eor x1, x22, x9 - adcs x24, x23, x12 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x21 - adcs x15, x17, x12 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x2, x24, x11 - stp x22, x5, [sp, #288] - adcs x11, x13, x23 - adcs x12, x8, x23 - stp x2, x11, [sp, #304] - adc x13, x15, x23 - stp x12, x13, [sp, #320] - ldp x5, x6, [sp, #96] - ldp x4, x3, [sp, #192] - subs x5, x5, x4 - sbcs x6, x6, x3 - ldp x7, x8, [sp, #112] - ldp x4, x3, [sp, #208] - sbcs x7, x7, x4 - sbcs x8, x8, x3 - ldp x9, x10, [sp, #128] - ldp x4, x3, [sp, #224] - sbcs x9, x9, x4 - sbcs x10, x10, x3 - csetm x3, cc // cc = lo, ul, last - mov x4, #0xffffffff // #4294967295 - and x4, x4, x3 - adds 
x5, x5, x4 - eor x4, x4, x3 - adcs x6, x6, x4 - mov x4, #0xfffffffffffffffe // #-2 - and x4, x4, x3 - adcs x7, x7, x4 - adcs x8, x8, x3 - adcs x9, x9, x3 - adc x10, x10, x3 - stp x5, x6, [sp, #240] - stp x7, x8, [sp, #256] - stp x9, x10, [sp, #272] - ldp x5, x6, [sp, #48] - ldp x4, x3, [sp, #288] - subs x5, x5, x4 - sbcs x6, x6, x3 - ldp x7, x8, [sp, #64] - sbcs x7, x7, x2 - sbcs x8, x8, x11 - ldp x9, x10, [sp, #80] - sbcs x9, x9, x12 - sbcs x10, x10, x13 - csetm x3, cc // cc = lo, ul, last - mov x4, #0xffffffff // #4294967295 - and x4, x4, x3 - adds x5, x5, x4 - eor x4, x4, x3 - adcs x6, x6, x4 - mov x4, #0xfffffffffffffffe // #-2 - and x4, x4, x3 - adcs x7, x7, x4 - adcs x8, x8, x3 - adcs x9, x9, x3 - adc x10, x10, x3 - stp x5, x6, [sp, #48] - stp x7, x8, [sp, #64] - stp x9, x10, [sp, #80] - ldr q1, [sp, #240] - ldp x9, x2, [sp, #240] - ldr q0, [sp, #240] - ldp x4, x6, [sp, #256] - rev64 v21.4s, v1.4s - uzp2 v28.4s, v1.4s, v1.4s - umulh x7, x9, x2 - xtn v17.2s, v1.2d - mul v27.4s, v21.4s, v0.4s - ldr q20, [sp, #272] - xtn v30.2s, v0.2d - ldr q1, [sp, #272] - uzp2 v31.4s, v0.4s, v0.4s - ldp x5, x10, [sp, #272] - umulh x8, x9, x4 - uaddlp v3.2d, v27.4s - umull v16.2d, v30.2s, v17.2s - mul x16, x9, x4 - umull v27.2d, v30.2s, v28.2s - shrn v0.2s, v20.2d, #32 - xtn v7.2s, v20.2d - shl v20.2d, v3.2d, #32 - umull v3.2d, v31.2s, v28.2s - mul x3, x2, x4 - umlal v20.2d, v30.2s, v17.2s - umull v22.2d, v7.2s, v0.2s - usra v27.2d, v16.2d, #32 - umulh x11, x2, x4 - movi v21.2d, #0xffffffff - uzp2 v28.4s, v1.4s, v1.4s - adds x15, x16, x7 - and v5.16b, v27.16b, v21.16b - adcs x3, x3, x8 - usra v3.2d, v27.2d, #32 - dup v29.2d, x6 - adcs x16, x11, xzr - mov x14, v20.d[0] - umlal v5.2d, v31.2s, v17.2s - mul x8, x9, x2 - mov x7, v20.d[1] - shl v19.2d, v22.2d, #33 - xtn v25.2s, v29.2d - rev64 v31.4s, v1.4s - lsl x13, x14, #32 - uzp2 v6.4s, v29.4s, v29.4s - umlal v19.2d, v7.2s, v7.2s - usra v3.2d, v5.2d, #32 - adds x1, x8, x8 - umulh x8, x4, x4 - add x12, x13, x14 - mul v17.4s, v31.4s, v29.4s - xtn v4.2s, v1.2d - adcs x14, x15, x15 - lsr x13, x12, #32 - adcs x15, x3, x3 - umull v31.2d, v25.2s, v28.2s - adcs x11, x16, x16 - umull v21.2d, v25.2s, v4.2s - mov x17, v3.d[0] - umull v18.2d, v6.2s, v28.2s - adc x16, x8, xzr - uaddlp v16.2d, v17.4s - movi v1.2d, #0xffffffff - subs x13, x13, x12 - usra v31.2d, v21.2d, #32 - sbc x8, x12, xzr - adds x17, x17, x1 - mul x1, x4, x4 - shl v28.2d, v16.2d, #32 - mov x3, v3.d[1] - adcs x14, x7, x14 - extr x7, x8, x13, #32 - adcs x13, x3, x15 - and v3.16b, v31.16b, v1.16b - adcs x11, x1, x11 - lsr x1, x8, #32 - umlal v3.2d, v6.2s, v4.2s - usra v18.2d, v31.2d, #32 - adc x3, x16, xzr - adds x1, x1, x12 - umlal v28.2d, v25.2s, v4.2s - adc x16, xzr, xzr - subs x15, x17, x7 - sbcs x7, x14, x1 - lsl x1, x15, #32 - sbcs x16, x13, x16 - add x8, x1, x15 - usra v18.2d, v3.2d, #32 - sbcs x14, x11, xzr - lsr x1, x8, #32 - sbcs x17, x3, xzr - sbc x11, x12, xzr - subs x13, x1, x8 - umulh x12, x4, x10 - sbc x1, x8, xzr - extr x13, x1, x13, #32 - lsr x1, x1, #32 - adds x15, x1, x8 - adc x1, xzr, xzr - subs x7, x7, x13 - sbcs x13, x16, x15 - lsl x3, x7, #32 - umulh x16, x2, x5 - sbcs x15, x14, x1 - add x7, x3, x7 - sbcs x3, x17, xzr - lsr x1, x7, #32 - sbcs x14, x11, xzr - sbc x11, x8, xzr - subs x8, x1, x7 - sbc x1, x7, xzr - extr x8, x1, x8, #32 - lsr x1, x1, #32 - adds x1, x1, x7 - adc x17, xzr, xzr - subs x13, x13, x8 - umulh x8, x9, x6 - sbcs x1, x15, x1 - sbcs x15, x3, x17 - sbcs x3, x14, xzr - mul x17, x2, x5 - sbcs x11, x11, xzr - stp x13, x1, [sp, #144] - sbc x14, x7, xzr - mul x7, x4, 
x10 - subs x1, x9, x2 - stp x15, x3, [sp, #160] - csetm x15, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - stp x11, x14, [sp, #176] - mul x14, x9, x6 - adds x17, x8, x17 - adcs x7, x16, x7 - adc x13, x12, xzr - subs x12, x5, x6 - cneg x3, x12, cc // cc = lo, ul, last - cinv x16, x15, cc // cc = lo, ul, last - mul x8, x1, x3 - umulh x1, x1, x3 - eor x12, x8, x16 - adds x11, x17, x14 - adcs x3, x7, x17 - adcs x15, x13, x7 - adc x8, x13, xzr - adds x3, x3, x14 - adcs x15, x15, x17 - adcs x17, x8, x7 - eor x1, x1, x16 - adc x13, x13, xzr - subs x9, x9, x4 - csetm x8, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x4, x2, x4 - cneg x4, x4, cc // cc = lo, ul, last - csetm x7, cc // cc = lo, ul, last - subs x2, x10, x6 - cinv x8, x8, cc // cc = lo, ul, last - cneg x2, x2, cc // cc = lo, ul, last - cmn x16, #0x1 - adcs x11, x11, x12 - mul x12, x9, x2 - adcs x3, x3, x1 - adcs x15, x15, x16 - umulh x9, x9, x2 - adcs x17, x17, x16 - adc x13, x13, x16 - subs x1, x10, x5 - cinv x2, x7, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - eor x9, x9, x8 - cmn x8, #0x1 - eor x7, x12, x8 - mul x12, x4, x1 - adcs x3, x3, x7 - adcs x7, x15, x9 - adcs x15, x17, x8 - ldp x9, x17, [sp, #160] - umulh x4, x4, x1 - adc x8, x13, x8 - cmn x2, #0x1 - eor x1, x12, x2 - adcs x1, x7, x1 - ldp x7, x16, [sp, #144] - eor x12, x4, x2 - adcs x4, x15, x12 - ldp x15, x12, [sp, #176] - adc x8, x8, x2 - adds x13, x14, x14 - umulh x14, x5, x10 - adcs x2, x11, x11 - adcs x3, x3, x3 - adcs x1, x1, x1 - adcs x4, x4, x4 - adcs x11, x8, x8 - adc x8, xzr, xzr - adds x13, x13, x7 - adcs x2, x2, x16 - mul x16, x5, x10 - adcs x3, x3, x9 - adcs x1, x1, x17 - umulh x5, x5, x5 - lsl x9, x13, #32 - add x9, x9, x13 - adcs x4, x4, x15 - mov x13, v28.d[1] - adcs x15, x11, x12 - lsr x7, x9, #32 - adc x11, x8, xzr - subs x7, x7, x9 - umulh x10, x10, x10 - sbc x17, x9, xzr - extr x7, x17, x7, #32 - lsr x17, x17, #32 - adds x17, x17, x9 - adc x12, xzr, xzr - subs x8, x2, x7 - sbcs x17, x3, x17 - lsl x7, x8, #32 - sbcs x2, x1, x12 - add x3, x7, x8 - sbcs x12, x4, xzr - lsr x1, x3, #32 - sbcs x7, x15, xzr - sbc x15, x9, xzr - subs x1, x1, x3 - sbc x4, x3, xzr - lsr x9, x4, #32 - extr x8, x4, x1, #32 - adds x9, x9, x3 - adc x4, xzr, xzr - subs x1, x17, x8 - lsl x17, x1, #32 - sbcs x8, x2, x9 - sbcs x9, x12, x4 - add x17, x17, x1 - mov x1, v18.d[1] - lsr x2, x17, #32 - sbcs x7, x7, xzr - mov x12, v18.d[0] - sbcs x15, x15, xzr - sbc x3, x3, xzr - subs x4, x2, x17 - sbc x2, x17, xzr - adds x12, x13, x12 - adcs x16, x16, x1 - lsr x13, x2, #32 - extr x1, x2, x4, #32 - adc x2, x14, xzr - adds x4, x13, x17 - mul x13, x6, x6 - adc x14, xzr, xzr - subs x1, x8, x1 - sbcs x4, x9, x4 - mov x9, v28.d[0] - sbcs x7, x7, x14 - sbcs x8, x15, xzr - sbcs x3, x3, xzr - sbc x14, x17, xzr - adds x17, x9, x9 - adcs x12, x12, x12 - mov x15, v19.d[0] - adcs x9, x16, x16 - umulh x6, x6, x6 - adcs x16, x2, x2 - adc x2, xzr, xzr - adds x11, x11, x8 - adcs x3, x3, xzr - adcs x14, x14, xzr - adcs x8, xzr, xzr - adds x13, x1, x13 - mov x1, v19.d[1] - adcs x6, x4, x6 - mov x4, #0xffffffff // #4294967295 - adcs x15, x7, x15 - adcs x7, x11, x5 - adcs x1, x3, x1 - adcs x14, x14, x10 - adc x11, x8, xzr - adds x6, x6, x17 - adcs x8, x15, x12 - adcs x3, x7, x9 - adcs x15, x1, x16 - mov x16, #0xffffffff00000001 // #-4294967295 - adcs x14, x14, x2 - mov x2, #0x1 // #1 - adc x17, x11, xzr - cmn x13, x16 - adcs xzr, x6, x4 - adcs xzr, x8, x2 - adcs xzr, x3, xzr - adcs xzr, x15, xzr - adcs xzr, x14, xzr - adc x1, x17, xzr - neg x9, x1 - and 
x1, x16, x9 - adds x11, x13, x1 - and x13, x4, x9 - adcs x5, x6, x13 - and x1, x2, x9 - adcs x7, x8, x1 - stp x11, x5, [sp, #144] - adcs x11, x3, xzr - adcs x2, x15, xzr - stp x7, x11, [sp, #160] - adc x17, x14, xzr - stp x2, x17, [sp, #176] - mov x0, sp - ldr q1, [sp, #48] - ldp x9, x2, [sp, #48] - ldr q0, [sp, #48] - ldp x4, x6, [sp, #64] - rev64 v21.4s, v1.4s - uzp2 v28.4s, v1.4s, v1.4s - umulh x7, x9, x2 - xtn v17.2s, v1.2d - mul v27.4s, v21.4s, v0.4s - ldr q20, [sp, #80] - xtn v30.2s, v0.2d - ldr q1, [sp, #80] - uzp2 v31.4s, v0.4s, v0.4s - ldp x5, x10, [sp, #80] - umulh x8, x9, x4 - uaddlp v3.2d, v27.4s - umull v16.2d, v30.2s, v17.2s - mul x16, x9, x4 - umull v27.2d, v30.2s, v28.2s - shrn v0.2s, v20.2d, #32 - xtn v7.2s, v20.2d - shl v20.2d, v3.2d, #32 - umull v3.2d, v31.2s, v28.2s - mul x3, x2, x4 - umlal v20.2d, v30.2s, v17.2s - umull v22.2d, v7.2s, v0.2s - usra v27.2d, v16.2d, #32 - umulh x11, x2, x4 - movi v21.2d, #0xffffffff - uzp2 v28.4s, v1.4s, v1.4s - adds x15, x16, x7 - and v5.16b, v27.16b, v21.16b - adcs x3, x3, x8 - usra v3.2d, v27.2d, #32 - dup v29.2d, x6 - adcs x16, x11, xzr - mov x14, v20.d[0] - umlal v5.2d, v31.2s, v17.2s - mul x8, x9, x2 - mov x7, v20.d[1] - shl v19.2d, v22.2d, #33 - xtn v25.2s, v29.2d - rev64 v31.4s, v1.4s - lsl x13, x14, #32 - uzp2 v6.4s, v29.4s, v29.4s - umlal v19.2d, v7.2s, v7.2s - usra v3.2d, v5.2d, #32 - adds x1, x8, x8 - umulh x8, x4, x4 - add x12, x13, x14 - mul v17.4s, v31.4s, v29.4s - xtn v4.2s, v1.2d - adcs x14, x15, x15 - lsr x13, x12, #32 - adcs x15, x3, x3 - umull v31.2d, v25.2s, v28.2s - adcs x11, x16, x16 - umull v21.2d, v25.2s, v4.2s - mov x17, v3.d[0] - umull v18.2d, v6.2s, v28.2s - adc x16, x8, xzr - uaddlp v16.2d, v17.4s - movi v1.2d, #0xffffffff - subs x13, x13, x12 - usra v31.2d, v21.2d, #32 - sbc x8, x12, xzr - adds x17, x17, x1 - mul x1, x4, x4 - shl v28.2d, v16.2d, #32 - mov x3, v3.d[1] - adcs x14, x7, x14 - extr x7, x8, x13, #32 - adcs x13, x3, x15 - and v3.16b, v31.16b, v1.16b - adcs x11, x1, x11 - lsr x1, x8, #32 - umlal v3.2d, v6.2s, v4.2s - usra v18.2d, v31.2d, #32 - adc x3, x16, xzr - adds x1, x1, x12 - umlal v28.2d, v25.2s, v4.2s - adc x16, xzr, xzr - subs x15, x17, x7 - sbcs x7, x14, x1 - lsl x1, x15, #32 - sbcs x16, x13, x16 - add x8, x1, x15 - usra v18.2d, v3.2d, #32 - sbcs x14, x11, xzr - lsr x1, x8, #32 - sbcs x17, x3, xzr - sbc x11, x12, xzr - subs x13, x1, x8 - umulh x12, x4, x10 - sbc x1, x8, xzr - extr x13, x1, x13, #32 - lsr x1, x1, #32 - adds x15, x1, x8 - adc x1, xzr, xzr - subs x7, x7, x13 - sbcs x13, x16, x15 - lsl x3, x7, #32 - umulh x16, x2, x5 - sbcs x15, x14, x1 - add x7, x3, x7 - sbcs x3, x17, xzr - lsr x1, x7, #32 - sbcs x14, x11, xzr - sbc x11, x8, xzr - subs x8, x1, x7 - sbc x1, x7, xzr - extr x8, x1, x8, #32 - lsr x1, x1, #32 - adds x1, x1, x7 - adc x17, xzr, xzr - subs x13, x13, x8 - umulh x8, x9, x6 - sbcs x1, x15, x1 - sbcs x15, x3, x17 - sbcs x3, x14, xzr - mul x17, x2, x5 - sbcs x11, x11, xzr - stp x13, x1, [x0] - sbc x14, x7, xzr - mul x7, x4, x10 - subs x1, x9, x2 - stp x15, x3, [x0, #16] - csetm x15, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - stp x11, x14, [x0, #32] - mul x14, x9, x6 - adds x17, x8, x17 - adcs x7, x16, x7 - adc x13, x12, xzr - subs x12, x5, x6 - cneg x3, x12, cc // cc = lo, ul, last - cinv x16, x15, cc // cc = lo, ul, last - mul x8, x1, x3 - umulh x1, x1, x3 - eor x12, x8, x16 - adds x11, x17, x14 - adcs x3, x7, x17 - adcs x15, x13, x7 - adc x8, x13, xzr - adds x3, x3, x14 - adcs x15, x15, x17 - adcs x17, x8, x7 - eor x1, x1, x16 - adc x13, x13, xzr - 
subs x9, x9, x4 - csetm x8, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x4, x2, x4 - cneg x4, x4, cc // cc = lo, ul, last - csetm x7, cc // cc = lo, ul, last - subs x2, x10, x6 - cinv x8, x8, cc // cc = lo, ul, last - cneg x2, x2, cc // cc = lo, ul, last - cmn x16, #0x1 - adcs x11, x11, x12 - mul x12, x9, x2 - adcs x3, x3, x1 - adcs x15, x15, x16 - umulh x9, x9, x2 - adcs x17, x17, x16 - adc x13, x13, x16 - subs x1, x10, x5 - cinv x2, x7, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - eor x9, x9, x8 - cmn x8, #0x1 - eor x7, x12, x8 - mul x12, x4, x1 - adcs x3, x3, x7 - adcs x7, x15, x9 - adcs x15, x17, x8 - ldp x9, x17, [x0, #16] - umulh x4, x4, x1 - adc x8, x13, x8 - cmn x2, #0x1 - eor x1, x12, x2 - adcs x1, x7, x1 - ldp x7, x16, [x0] - eor x12, x4, x2 - adcs x4, x15, x12 - ldp x15, x12, [x0, #32] - adc x8, x8, x2 - adds x13, x14, x14 - umulh x14, x5, x10 - adcs x2, x11, x11 - adcs x3, x3, x3 - adcs x1, x1, x1 - adcs x4, x4, x4 - adcs x11, x8, x8 - adc x8, xzr, xzr - adds x13, x13, x7 - adcs x2, x2, x16 - mul x16, x5, x10 - adcs x3, x3, x9 - adcs x1, x1, x17 - umulh x5, x5, x5 - lsl x9, x13, #32 - add x9, x9, x13 - adcs x4, x4, x15 - mov x13, v28.d[1] - adcs x15, x11, x12 - lsr x7, x9, #32 - adc x11, x8, xzr - subs x7, x7, x9 - umulh x10, x10, x10 - sbc x17, x9, xzr - extr x7, x17, x7, #32 - lsr x17, x17, #32 - adds x17, x17, x9 - adc x12, xzr, xzr - subs x8, x2, x7 - sbcs x17, x3, x17 - lsl x7, x8, #32 - sbcs x2, x1, x12 - add x3, x7, x8 - sbcs x12, x4, xzr - lsr x1, x3, #32 - sbcs x7, x15, xzr - sbc x15, x9, xzr - subs x1, x1, x3 - sbc x4, x3, xzr - lsr x9, x4, #32 - extr x8, x4, x1, #32 - adds x9, x9, x3 - adc x4, xzr, xzr - subs x1, x17, x8 - lsl x17, x1, #32 - sbcs x8, x2, x9 - sbcs x9, x12, x4 - add x17, x17, x1 - mov x1, v18.d[1] - lsr x2, x17, #32 - sbcs x7, x7, xzr - mov x12, v18.d[0] - sbcs x15, x15, xzr - sbc x3, x3, xzr - subs x4, x2, x17 - sbc x2, x17, xzr - adds x12, x13, x12 - adcs x16, x16, x1 - lsr x13, x2, #32 - extr x1, x2, x4, #32 - adc x2, x14, xzr - adds x4, x13, x17 - mul x13, x6, x6 - adc x14, xzr, xzr - subs x1, x8, x1 - sbcs x4, x9, x4 - mov x9, v28.d[0] - sbcs x7, x7, x14 - sbcs x8, x15, xzr - sbcs x3, x3, xzr - sbc x14, x17, xzr - adds x17, x9, x9 - adcs x12, x12, x12 - mov x15, v19.d[0] - adcs x9, x16, x16 - umulh x6, x6, x6 - adcs x16, x2, x2 - adc x2, xzr, xzr - adds x11, x11, x8 - adcs x3, x3, xzr - adcs x14, x14, xzr - adcs x8, xzr, xzr - adds x13, x1, x13 - mov x1, v19.d[1] - adcs x6, x4, x6 - mov x4, #0xffffffff // #4294967295 - adcs x15, x7, x15 - adcs x7, x11, x5 - adcs x1, x3, x1 - adcs x14, x14, x10 - adc x11, x8, xzr - adds x6, x6, x17 - adcs x8, x15, x12 - adcs x3, x7, x9 - adcs x15, x1, x16 - mov x16, #0xffffffff00000001 // #-4294967295 - adcs x14, x14, x2 - mov x2, #0x1 // #1 - adc x17, x11, xzr - cmn x13, x16 - adcs xzr, x6, x4 - adcs xzr, x8, x2 - adcs xzr, x3, xzr - adcs xzr, x15, xzr - adcs xzr, x14, xzr - adc x1, x17, xzr - neg x9, x1 - and x1, x16, x9 - adds x11, x13, x1 - and x13, x4, x9 - adcs x5, x6, x13 - and x1, x2, x9 - adcs x7, x8, x1 - stp x11, x5, [x0] - adcs x11, x3, xzr - adcs x2, x15, xzr - stp x7, x11, [x0, #16] - adc x17, x14, xzr - stp x2, x17, [x0, #32] - ldr q3, [sp, #144] - ldr q25, [sp, #192] - ldp x13, x23, [sp, #192] - ldp x3, x21, [sp, #144] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [sp, #224] - ldp x8, x24, [sp, #160] - subs x6, x3, x21 - ldr q0, [sp, #176] - movi v23.2d, 
#0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [sp, #208] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [sp, #224] - add x2, x12, x7 - adc x7, x5, x10 - ldp x5, x10, [sp, #176] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x16, x4, x16 - mov x4, v27.d[0] - sbcs x11, x20, x11 - sbcs x20, x9, x12 - stp x16, x11, [sp, #192] - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #208] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #224] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, 
x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - ldp x20, x9, [sp, #192] - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #208] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #224] - adds x20, x22, x20 - mul x10, x13, x14 - adcs x11, x11, x9 - eor x9, x8, x21 - adcs x21, x19, x17 - stp x20, x11, [sp, #192] - adcs x12, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - stp x21, x12, [sp, #208] - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #224] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #192] - ldp x21, x12, [sp, #208] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #224] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x21 - eor x1, x22, x9 - adcs x24, x23, x12 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs 
x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x21 - adcs x15, x17, x12 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x14, x24, x11 - stp x22, x5, [sp, #192] - adcs x5, x13, x23 - adcs x21, x8, x23 - stp x14, x5, [sp, #208] - adc x12, x15, x23 - stp x21, x12, [sp, #224] - ldr q3, [sp, #144] - ldr q25, [sp, #96] - ldp x13, x23, [sp, #96] - ldp x3, x21, [sp, #144] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [sp, #128] - ldp x8, x24, [sp, #160] - subs x6, x3, x21 - ldr q0, [sp, #176] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [sp, #112] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 
- cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [sp, #128] - add x2, x12, x7 - adc x7, x5, x10 - ldp x5, x10, [sp, #176] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x16, x4, x16 - mov x4, v27.d[0] - sbcs x11, x20, x11 - sbcs x20, x9, x12 - stp x16, x11, [sp, #96] - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #112] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #128] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - ldp x20, x9, [sp, #96] - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #112] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #128] - adds x20, x22, x20 - mul x10, x13, x14 - adcs x11, x11, x9 - eor x9, x8, x21 - adcs x21, x19, x17 - stp x20, x11, [sp, #96] - adcs x12, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - stp x21, x12, [sp, #112] - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - 
umulh x2, x16, x6 - stp x3, x12, [sp, #128] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #96] - ldp x21, x12, [sp, #112] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #128] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x21 - eor x1, x22, x9 - adcs x24, x23, x12 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x21 - adcs x15, x17, x12 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x2, x24, x11 - stp x22, x5, [sp, #96] - adcs x11, x13, x23 - adcs x12, x8, x23 - stp x2, x11, [sp, #112] - adc x13, x15, x23 - stp x12, x13, [sp, #128] - mov x0, sp - mov x1, sp - ldp x5, x6, [x1] - ldp x4, x3, [sp, #192] - subs x5, x5, x4 - sbcs x6, x6, x3 - ldp x7, 
x8, [x1, #16] - ldp x4, x3, [sp, #208] - sbcs x7, x7, x4 - sbcs x8, x8, x3 - ldp x9, x10, [x1, #32] - ldp x4, x3, [sp, #224] - sbcs x9, x9, x4 - sbcs x10, x10, x3 - csetm x3, cc // cc = lo, ul, last - mov x4, #0xffffffff // #4294967295 - and x4, x4, x3 - adds x5, x5, x4 - eor x4, x4, x3 - adcs x6, x6, x4 - mov x4, #0xfffffffffffffffe // #-2 - and x4, x4, x3 - adcs x7, x7, x4 - adcs x8, x8, x3 - adcs x9, x9, x3 - adc x10, x10, x3 - stp x5, x6, [x0] - stp x7, x8, [x0, #16] - stp x9, x10, [x0, #32] - ldp x5, x6, [sp, #96] - ldp x4, x3, [sp, #192] - subs x5, x5, x4 - sbcs x6, x6, x3 - ldp x4, x3, [sp, #208] - sbcs x7, x2, x4 - sbcs x8, x11, x3 - ldp x4, x3, [sp, #224] - sbcs x9, x12, x4 - sbcs x10, x13, x3 - csetm x3, cc // cc = lo, ul, last - mov x4, #0xffffffff // #4294967295 - and x4, x4, x3 - adds x5, x5, x4 - eor x4, x4, x3 - adcs x6, x6, x4 - mov x4, #0xfffffffffffffffe // #-2 - and x4, x4, x3 - adcs x7, x7, x4 - adcs x8, x8, x3 - adcs x9, x9, x3 - adc x10, x10, x3 - stp x5, x6, [sp, #144] - stp x7, x8, [sp, #160] - stp x9, x10, [sp, #176] - ldr q3, [sp, #240] - ldr q25, [x25, #96] - ldp x13, x23, [x25, #96] - ldp x3, x21, [sp, #240] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [x25, #128] - ldp x8, x24, [sp, #256] - subs x6, x3, x21 - ldr q0, [sp, #272] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [x25, #112] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [x25, #128] - add x2, x12, x7 - adc x7, x5, x10 - ldp x5, x10, [sp, #272] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, 
x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x16, x4, x16 - mov x4, v27.d[0] - sbcs x11, x20, x11 - sbcs x20, x9, x12 - stp x16, x11, [sp, #240] - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #256] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #272] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - ldp x20, x9, [sp, #240] - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #256] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #272] - adds x20, x22, x20 - mul x10, x13, x14 - adcs x11, x11, x9 - eor x9, x8, x21 - adcs x21, x19, x17 - stp x20, x11, [sp, #240] - adcs x12, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - stp x21, x12, [sp, #256] - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #272] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, 
x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #240] - ldp x21, x12, [sp, #256] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #272] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x21 - eor x1, x22, x9 - adcs x24, x23, x12 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x21 - adcs x15, x17, x12 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x14, x24, x11 - stp x22, x5, [sp, #240] - adcs x5, x13, x23 - adcs x21, x8, x23 - stp x14, x5, [sp, #256] - adc x12, x15, x23 - stp x21, x12, [sp, #272] - mov x0, sp - mov x1, sp - ldp x5, x6, [x1] - ldp x4, x3, [sp, #96] - subs x5, x5, x4 - sbcs x6, x6, x3 - ldp x7, x8, [x1, #16] - ldp x4, x3, [sp, #112] - sbcs x7, x7, x4 - sbcs x8, x8, x3 - ldp x9, x10, [x1, #32] - ldp x4, x3, [sp, #128] - sbcs x9, x9, x4 - sbcs x10, x10, x3 - csetm x3, cc // cc = lo, ul, last - mov x4, #0xffffffff // #4294967295 - and x4, x4, x3 - adds x2, x5, x4 - eor x4, x4, x3 - adcs x11, x6, x4 - mov x4, #0xfffffffffffffffe // #-2 - and x4, x4, x3 - adcs x4, x7, x4 - adcs x12, x8, x3 - adcs x13, x9, x3 - adc x3, x10, x3 - stp x2, x11, [x0] - stp x4, x12, [x0, #16] - stp x13, 
x3, [x0, #32] - ldp x5, x6, [sp, #192] - subs x5, x5, x2 - sbcs x6, x6, x11 - ldp x7, x8, [sp, #208] - sbcs x7, x7, x4 - sbcs x8, x8, x12 - ldp x9, x10, [sp, #224] - sbcs x9, x9, x13 - sbcs x10, x10, x3 - csetm x3, cc // cc = lo, ul, last - mov x4, #0xffffffff // #4294967295 - and x4, x4, x3 - adds x5, x5, x4 - eor x4, x4, x3 - adcs x6, x6, x4 - mov x4, #0xfffffffffffffffe // #-2 - and x4, x4, x3 - adcs x7, x7, x4 - adcs x8, x8, x3 - adcs x9, x9, x3 - adc x10, x10, x3 - stp x5, x6, [sp, #192] - stp x7, x8, [sp, #208] - stp x9, x10, [sp, #224] - ldr q3, [sp, #144] - ldr q25, [sp, #288] - ldp x13, x23, [sp, #288] - ldp x3, x21, [sp, #144] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [sp, #320] - ldp x8, x24, [sp, #160] - subs x6, x3, x21 - ldr q0, [sp, #176] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [sp, #304] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [sp, #320] - add x2, x12, x7 - adc x7, x5, x10 - ldp x5, x10, [sp, #176] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - 
sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x16, x4, x16 - mov x4, v27.d[0] - sbcs x11, x20, x11 - sbcs x20, x9, x12 - stp x16, x11, [sp, #144] - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #160] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #176] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - ldp x20, x9, [sp, #144] - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #160] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #176] - adds x20, x22, x20 - mul x10, x13, x14 - adcs x11, x11, x9 - eor x9, x8, x21 - adcs x21, x19, x17 - stp x20, x11, [sp, #144] - adcs x12, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - stp x21, x12, [sp, #160] - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #176] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn 
x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #144] - ldp x21, x12, [sp, #160] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #176] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x21 - eor x1, x22, x9 - adcs x24, x23, x12 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x21 - adcs x15, x17, x12 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x14, x24, x11 - stp x22, x5, [sp, #144] - adcs x5, x13, x23 - adcs x21, x8, x23 - stp x14, x5, [sp, #160] - adc x12, x15, x23 - stp x21, x12, [sp, #176] - ldr q3, [sp, #240] - ldr q25, [x26, #96] - ldp x13, x23, [x26, #96] - ldp x3, x21, [sp, #240] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [x26, #128] - ldp x8, x24, [sp, #256] - subs x6, x3, x21 - ldr q0, [sp, #272] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [x26, #112] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - 
adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [x26, #128] - add x2, x12, x7 - adc x7, x5, x10 - ldp x5, x10, [sp, #272] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x16, x4, x16 - mov x4, v27.d[0] - sbcs x11, x20, x11 - sbcs x20, x9, x12 - stp x16, x11, [sp, #240] - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #256] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #272] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc 
x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - ldp x20, x9, [sp, #240] - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #256] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #272] - adds x20, x22, x20 - mul x10, x13, x14 - adcs x11, x11, x9 - eor x9, x8, x21 - adcs x21, x19, x17 - stp x20, x11, [sp, #240] - adcs x12, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - stp x21, x12, [sp, #256] - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #272] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #240] - ldp x21, x12, [sp, #256] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #272] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x21 - eor x1, x22, x9 - adcs x24, x23, x12 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x21 - adcs x15, x17, x12 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, 
xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x14, x24, x11 - stp x22, x5, [sp, #240] - adcs x5, x13, x23 - adcs x21, x8, x23 - stp x14, x5, [sp, #256] - adc x12, x15, x23 - stp x21, x12, [sp, #272] - ldp x2, x27, [sp, #0x150] - ldr q3, [sp, #48] - ldr q25, [sp, #192] - ldp x13, x23, [sp, #192] - ldp x3, x21, [sp, #48] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [sp, #224] - ldp x8, x24, [sp, #64] - subs x6, x3, x21 - ldr q0, [sp, #80] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [sp, #208] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [sp, #224] - add x2, x12, x7 - adc x7, x5, x10 - ldp x5, x10, [sp, #80] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs 
x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x16, x4, x16 - mov x4, v27.d[0] - sbcs x11, x20, x11 - sbcs x20, x9, x12 - stp x16, x11, [sp, #192] - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #208] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #224] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - ldp x20, x9, [sp, #192] - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #208] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #224] - adds x20, x22, x20 - mul x10, x13, x14 - adcs x11, x11, x9 - eor x9, x8, x21 - adcs x21, x19, x17 - stp x20, x11, [sp, #192] - adcs x12, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - stp x21, x12, [sp, #208] - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #224] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, 
x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #192] - ldp x21, x12, [sp, #208] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #224] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x21 - eor x1, x22, x9 - adcs x24, x23, x12 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x21 - adcs x15, x17, x12 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x2, x6, x20 - eor x3, x20, x23 - adcs x6, x7, x3 - adcs x7, x24, x11 - adcs x9, x13, x23 - adcs x10, x8, x23 - adc x11, x15, x23 - ldp x4, x3, [sp, #144] - subs x5, x2, x4 - sbcs x6, x6, x3 - ldp x4, x3, [sp, #160] - sbcs x7, x7, x4 - sbcs x8, x9, x3 - ldp x4, x3, [sp, #176] - sbcs x9, x10, x4 - sbcs x10, x11, x3 - csetm x3, cc // cc = lo, ul, last - mov x4, #0xffffffff // #4294967295 - and x4, x4, x3 - adds x19, x5, x4 - eor x4, x4, x3 - adcs x24, x6, x4 - mov x4, #0xfffffffffffffffe // #-2 - and x4, x4, x3 - adcs x7, x7, x4 - adcs x8, x8, x3 - adcs x9, x9, x3 - adc x10, x10, x3 - stp x7, x8, [sp, #208] - stp x9, x10, [sp, #224] - ldp x0, x1, [x25, #96] - ldp x2, x3, [x25, #112] - ldp x4, x5, [x25, #128] - orr x20, x0, x1 - orr x21, x2, x3 - orr x22, x4, x5 - orr x20, x20, x21 - orr x20, x20, x22 - cmp x20, xzr - cset x20, ne // ne = any - ldp x6, x7, [x26, #96] - ldp x8, x9, [x26, #112] - ldp x10, x11, [x26, #128] - orr x21, x6, x7 - orr x22, x8, x9 - orr x23, x10, x11 - orr x21, x21, x22 - orr x21, x21, x23 - cmp x21, xzr - cset x21, ne // ne = any - cmp 
x21, x20 - ldp x12, x13, [sp, #240] - csel x12, x0, x12, cc // cc = lo, ul, last - csel x13, x1, x13, cc // cc = lo, ul, last - csel x12, x6, x12, hi // hi = pmore - csel x13, x7, x13, hi // hi = pmore - ldp x14, x15, [sp, #256] - csel x14, x2, x14, cc // cc = lo, ul, last - csel x15, x3, x15, cc // cc = lo, ul, last - csel x14, x8, x14, hi // hi = pmore - csel x15, x9, x15, hi // hi = pmore - ldp x16, x17, [sp, #272] - csel x16, x4, x16, cc // cc = lo, ul, last - csel x17, x5, x17, cc // cc = lo, ul, last - csel x16, x10, x16, hi // hi = pmore - csel x17, x11, x17, hi // hi = pmore - ldp x20, x21, [x25] - ldp x0, x1, [sp] - csel x0, x20, x0, cc // cc = lo, ul, last - csel x1, x21, x1, cc // cc = lo, ul, last - ldp x20, x21, [x26] - csel x0, x20, x0, hi // hi = pmore - csel x1, x21, x1, hi // hi = pmore - ldp x20, x21, [x25, #16] - ldp x2, x3, [sp, #16] - csel x2, x20, x2, cc // cc = lo, ul, last - csel x3, x21, x3, cc // cc = lo, ul, last - ldp x20, x21, [x26, #16] - csel x2, x20, x2, hi // hi = pmore - csel x3, x21, x3, hi // hi = pmore - ldp x20, x21, [x25, #32] - ldp x4, x5, [sp, #32] - csel x4, x20, x4, cc // cc = lo, ul, last - csel x5, x21, x5, cc // cc = lo, ul, last - ldp x20, x21, [x26, #32] - csel x4, x20, x4, hi // hi = pmore - csel x5, x21, x5, hi // hi = pmore - ldp x20, x21, [x25, #48] - csel x6, x20, x19, cc // cc = lo, ul, last - csel x7, x21, x24, cc // cc = lo, ul, last - ldp x20, x21, [x26, #48] - csel x6, x20, x6, hi // hi = pmore - csel x7, x21, x7, hi // hi = pmore - ldp x20, x21, [x25, #64] - ldp x8, x9, [sp, #208] - csel x8, x20, x8, cc // cc = lo, ul, last - csel x9, x21, x9, cc // cc = lo, ul, last - ldp x20, x21, [x26, #64] - csel x8, x20, x8, hi // hi = pmore - csel x9, x21, x9, hi // hi = pmore - ldp x20, x21, [x25, #80] - ldp x10, x11, [sp, #224] - csel x10, x20, x10, cc // cc = lo, ul, last - csel x11, x21, x11, cc // cc = lo, ul, last - ldp x20, x21, [x26, #80] - csel x10, x20, x10, hi // hi = pmore - csel x11, x21, x11, hi // hi = pmore - stp x0, x1, [x27] - stp x2, x3, [x27, #16] - stp x4, x5, [x27, #32] - stp x6, x7, [x27, #48] - stp x8, x9, [x27, #64] - stp x10, x11, [x27, #80] - stp x12, x13, [x27, #96] - stp x14, x15, [x27, #112] - stp x16, x17, [x27, #128] - add sp, sp, #0x180 - ldp x27, xzr, [sp], #16 - ldp x25, x26, [sp], #16 - ldp x23, x24, [sp], #16 - ldp x21, x22, [sp], #16 - ldp x19, x20, [sp], #16 - ret - -p384_montjscalarmul_p384_montjdouble: - sub sp, sp, #0x1a0 - stp x19, x20, [sp, #336] - stp x21, x22, [sp, #352] - stp x23, x24, [sp, #368] - stp x25, x26, [sp, #384] - stp x27, xzr, [sp, #400] - mov x25, x0 - mov x26, x1 - mov x0, sp - ldr q1, [x26, #96] - ldp x9, x2, [x26, #96] - ldr q0, [x26, #96] - ldp x4, x6, [x26, #112] - rev64 v21.4s, v1.4s - uzp2 v28.4s, v1.4s, v1.4s - umulh x7, x9, x2 - xtn v17.2s, v1.2d - mul v27.4s, v21.4s, v0.4s - ldr q20, [x26, #128] - xtn v30.2s, v0.2d - ldr q1, [x26, #128] - uzp2 v31.4s, v0.4s, v0.4s - ldp x5, x10, [x26, #128] - umulh x8, x9, x4 - uaddlp v3.2d, v27.4s - umull v16.2d, v30.2s, v17.2s - mul x16, x9, x4 - umull v27.2d, v30.2s, v28.2s - shrn v0.2s, v20.2d, #32 - xtn v7.2s, v20.2d - shl v20.2d, v3.2d, #32 - umull v3.2d, v31.2s, v28.2s - mul x3, x2, x4 - umlal v20.2d, v30.2s, v17.2s - umull v22.2d, v7.2s, v0.2s - usra v27.2d, v16.2d, #32 - umulh x11, x2, x4 - movi v21.2d, #0xffffffff - uzp2 v28.4s, v1.4s, v1.4s - adds x15, x16, x7 - and v5.16b, v27.16b, v21.16b - adcs x3, x3, x8 - usra v3.2d, v27.2d, #32 - dup v29.2d, x6 - adcs x16, x11, xzr - mov x14, v20.d[0] - umlal v5.2d, v31.2s, v17.2s - mul x8, 
x9, x2 - mov x7, v20.d[1] - shl v19.2d, v22.2d, #33 - xtn v25.2s, v29.2d - rev64 v31.4s, v1.4s - lsl x13, x14, #32 - uzp2 v6.4s, v29.4s, v29.4s - umlal v19.2d, v7.2s, v7.2s - usra v3.2d, v5.2d, #32 - adds x1, x8, x8 - umulh x8, x4, x4 - add x12, x13, x14 - mul v17.4s, v31.4s, v29.4s - xtn v4.2s, v1.2d - adcs x14, x15, x15 - lsr x13, x12, #32 - adcs x15, x3, x3 - umull v31.2d, v25.2s, v28.2s - adcs x11, x16, x16 - umull v21.2d, v25.2s, v4.2s - mov x17, v3.d[0] - umull v18.2d, v6.2s, v28.2s - adc x16, x8, xzr - uaddlp v16.2d, v17.4s - movi v1.2d, #0xffffffff - subs x13, x13, x12 - usra v31.2d, v21.2d, #32 - sbc x8, x12, xzr - adds x17, x17, x1 - mul x1, x4, x4 - shl v28.2d, v16.2d, #32 - mov x3, v3.d[1] - adcs x14, x7, x14 - extr x7, x8, x13, #32 - adcs x13, x3, x15 - and v3.16b, v31.16b, v1.16b - adcs x11, x1, x11 - lsr x1, x8, #32 - umlal v3.2d, v6.2s, v4.2s - usra v18.2d, v31.2d, #32 - adc x3, x16, xzr - adds x1, x1, x12 - umlal v28.2d, v25.2s, v4.2s - adc x16, xzr, xzr - subs x15, x17, x7 - sbcs x7, x14, x1 - lsl x1, x15, #32 - sbcs x16, x13, x16 - add x8, x1, x15 - usra v18.2d, v3.2d, #32 - sbcs x14, x11, xzr - lsr x1, x8, #32 - sbcs x17, x3, xzr - sbc x11, x12, xzr - subs x13, x1, x8 - umulh x12, x4, x10 - sbc x1, x8, xzr - extr x13, x1, x13, #32 - lsr x1, x1, #32 - adds x15, x1, x8 - adc x1, xzr, xzr - subs x7, x7, x13 - sbcs x13, x16, x15 - lsl x3, x7, #32 - umulh x16, x2, x5 - sbcs x15, x14, x1 - add x7, x3, x7 - sbcs x3, x17, xzr - lsr x1, x7, #32 - sbcs x14, x11, xzr - sbc x11, x8, xzr - subs x8, x1, x7 - sbc x1, x7, xzr - extr x8, x1, x8, #32 - lsr x1, x1, #32 - adds x1, x1, x7 - adc x17, xzr, xzr - subs x13, x13, x8 - umulh x8, x9, x6 - sbcs x1, x15, x1 - sbcs x15, x3, x17 - sbcs x3, x14, xzr - mul x17, x2, x5 - sbcs x11, x11, xzr - stp x13, x1, [x0] - sbc x14, x7, xzr - mul x7, x4, x10 - subs x1, x9, x2 - stp x15, x3, [x0, #16] - csetm x15, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - stp x11, x14, [x0, #32] - mul x14, x9, x6 - adds x17, x8, x17 - adcs x7, x16, x7 - adc x13, x12, xzr - subs x12, x5, x6 - cneg x3, x12, cc // cc = lo, ul, last - cinv x16, x15, cc // cc = lo, ul, last - mul x8, x1, x3 - umulh x1, x1, x3 - eor x12, x8, x16 - adds x11, x17, x14 - adcs x3, x7, x17 - adcs x15, x13, x7 - adc x8, x13, xzr - adds x3, x3, x14 - adcs x15, x15, x17 - adcs x17, x8, x7 - eor x1, x1, x16 - adc x13, x13, xzr - subs x9, x9, x4 - csetm x8, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x4, x2, x4 - cneg x4, x4, cc // cc = lo, ul, last - csetm x7, cc // cc = lo, ul, last - subs x2, x10, x6 - cinv x8, x8, cc // cc = lo, ul, last - cneg x2, x2, cc // cc = lo, ul, last - cmn x16, #0x1 - adcs x11, x11, x12 - mul x12, x9, x2 - adcs x3, x3, x1 - adcs x15, x15, x16 - umulh x9, x9, x2 - adcs x17, x17, x16 - adc x13, x13, x16 - subs x1, x10, x5 - cinv x2, x7, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - eor x9, x9, x8 - cmn x8, #0x1 - eor x7, x12, x8 - mul x12, x4, x1 - adcs x3, x3, x7 - adcs x7, x15, x9 - adcs x15, x17, x8 - ldp x9, x17, [x0, #16] - umulh x4, x4, x1 - adc x8, x13, x8 - cmn x2, #0x1 - eor x1, x12, x2 - adcs x1, x7, x1 - ldp x7, x16, [x0] - eor x12, x4, x2 - adcs x4, x15, x12 - ldp x15, x12, [x0, #32] - adc x8, x8, x2 - adds x13, x14, x14 - umulh x14, x5, x10 - adcs x2, x11, x11 - adcs x3, x3, x3 - adcs x1, x1, x1 - adcs x4, x4, x4 - adcs x11, x8, x8 - adc x8, xzr, xzr - adds x13, x13, x7 - adcs x2, x2, x16 - mul x16, x5, x10 - adcs x3, x3, x9 - adcs x1, x1, x17 - umulh x5, x5, x5 - lsl x9, x13, #32 - add x9, 
x9, x13 - adcs x4, x4, x15 - mov x13, v28.d[1] - adcs x15, x11, x12 - lsr x7, x9, #32 - adc x11, x8, xzr - subs x7, x7, x9 - umulh x10, x10, x10 - sbc x17, x9, xzr - extr x7, x17, x7, #32 - lsr x17, x17, #32 - adds x17, x17, x9 - adc x12, xzr, xzr - subs x8, x2, x7 - sbcs x17, x3, x17 - lsl x7, x8, #32 - sbcs x2, x1, x12 - add x3, x7, x8 - sbcs x12, x4, xzr - lsr x1, x3, #32 - sbcs x7, x15, xzr - sbc x15, x9, xzr - subs x1, x1, x3 - sbc x4, x3, xzr - lsr x9, x4, #32 - extr x8, x4, x1, #32 - adds x9, x9, x3 - adc x4, xzr, xzr - subs x1, x17, x8 - lsl x17, x1, #32 - sbcs x8, x2, x9 - sbcs x9, x12, x4 - add x17, x17, x1 - mov x1, v18.d[1] - lsr x2, x17, #32 - sbcs x7, x7, xzr - mov x12, v18.d[0] - sbcs x15, x15, xzr - sbc x3, x3, xzr - subs x4, x2, x17 - sbc x2, x17, xzr - adds x12, x13, x12 - adcs x16, x16, x1 - lsr x13, x2, #32 - extr x1, x2, x4, #32 - adc x2, x14, xzr - adds x4, x13, x17 - mul x13, x6, x6 - adc x14, xzr, xzr - subs x1, x8, x1 - sbcs x4, x9, x4 - mov x9, v28.d[0] - sbcs x7, x7, x14 - sbcs x8, x15, xzr - sbcs x3, x3, xzr - sbc x14, x17, xzr - adds x17, x9, x9 - adcs x12, x12, x12 - mov x15, v19.d[0] - adcs x9, x16, x16 - umulh x6, x6, x6 - adcs x16, x2, x2 - adc x2, xzr, xzr - adds x11, x11, x8 - adcs x3, x3, xzr - adcs x14, x14, xzr - adcs x8, xzr, xzr - adds x13, x1, x13 - mov x1, v19.d[1] - adcs x6, x4, x6 - mov x4, #0xffffffff // #4294967295 - adcs x15, x7, x15 - adcs x7, x11, x5 - adcs x1, x3, x1 - adcs x14, x14, x10 - adc x11, x8, xzr - adds x6, x6, x17 - adcs x8, x15, x12 - adcs x3, x7, x9 - adcs x15, x1, x16 - mov x16, #0xffffffff00000001 // #-4294967295 - adcs x14, x14, x2 - mov x2, #0x1 // #1 - adc x17, x11, xzr - cmn x13, x16 - adcs xzr, x6, x4 - adcs xzr, x8, x2 - adcs xzr, x3, xzr - adcs xzr, x15, xzr - adcs xzr, x14, xzr - adc x1, x17, xzr - neg x9, x1 - and x1, x16, x9 - adds x11, x13, x1 - and x13, x4, x9 - adcs x5, x6, x13 - and x1, x2, x9 - adcs x7, x8, x1 - stp x11, x5, [x0] - adcs x11, x3, xzr - adcs x2, x15, xzr - stp x7, x11, [x0, #16] - adc x17, x14, xzr - stp x2, x17, [x0, #32] - ldr q1, [x26, #48] - ldp x9, x2, [x26, #48] - ldr q0, [x26, #48] - ldp x4, x6, [x26, #64] - rev64 v21.4s, v1.4s - uzp2 v28.4s, v1.4s, v1.4s - umulh x7, x9, x2 - xtn v17.2s, v1.2d - mul v27.4s, v21.4s, v0.4s - ldr q20, [x26, #80] - xtn v30.2s, v0.2d - ldr q1, [x26, #80] - uzp2 v31.4s, v0.4s, v0.4s - ldp x5, x10, [x26, #80] - umulh x8, x9, x4 - uaddlp v3.2d, v27.4s - umull v16.2d, v30.2s, v17.2s - mul x16, x9, x4 - umull v27.2d, v30.2s, v28.2s - shrn v0.2s, v20.2d, #32 - xtn v7.2s, v20.2d - shl v20.2d, v3.2d, #32 - umull v3.2d, v31.2s, v28.2s - mul x3, x2, x4 - umlal v20.2d, v30.2s, v17.2s - umull v22.2d, v7.2s, v0.2s - usra v27.2d, v16.2d, #32 - umulh x11, x2, x4 - movi v21.2d, #0xffffffff - uzp2 v28.4s, v1.4s, v1.4s - adds x15, x16, x7 - and v5.16b, v27.16b, v21.16b - adcs x3, x3, x8 - usra v3.2d, v27.2d, #32 - dup v29.2d, x6 - adcs x16, x11, xzr - mov x14, v20.d[0] - umlal v5.2d, v31.2s, v17.2s - mul x8, x9, x2 - mov x7, v20.d[1] - shl v19.2d, v22.2d, #33 - xtn v25.2s, v29.2d - rev64 v31.4s, v1.4s - lsl x13, x14, #32 - uzp2 v6.4s, v29.4s, v29.4s - umlal v19.2d, v7.2s, v7.2s - usra v3.2d, v5.2d, #32 - adds x1, x8, x8 - umulh x8, x4, x4 - add x12, x13, x14 - mul v17.4s, v31.4s, v29.4s - xtn v4.2s, v1.2d - adcs x14, x15, x15 - lsr x13, x12, #32 - adcs x15, x3, x3 - umull v31.2d, v25.2s, v28.2s - adcs x11, x16, x16 - umull v21.2d, v25.2s, v4.2s - mov x17, v3.d[0] - umull v18.2d, v6.2s, v28.2s - adc x16, x8, xzr - uaddlp v16.2d, v17.4s - movi v1.2d, #0xffffffff - subs x13, x13, 
x12 - usra v31.2d, v21.2d, #32 - sbc x8, x12, xzr - adds x17, x17, x1 - mul x1, x4, x4 - shl v28.2d, v16.2d, #32 - mov x3, v3.d[1] - adcs x14, x7, x14 - extr x7, x8, x13, #32 - adcs x13, x3, x15 - and v3.16b, v31.16b, v1.16b - adcs x11, x1, x11 - lsr x1, x8, #32 - umlal v3.2d, v6.2s, v4.2s - usra v18.2d, v31.2d, #32 - adc x3, x16, xzr - adds x1, x1, x12 - umlal v28.2d, v25.2s, v4.2s - adc x16, xzr, xzr - subs x15, x17, x7 - sbcs x7, x14, x1 - lsl x1, x15, #32 - sbcs x16, x13, x16 - add x8, x1, x15 - usra v18.2d, v3.2d, #32 - sbcs x14, x11, xzr - lsr x1, x8, #32 - sbcs x17, x3, xzr - sbc x11, x12, xzr - subs x13, x1, x8 - umulh x12, x4, x10 - sbc x1, x8, xzr - extr x13, x1, x13, #32 - lsr x1, x1, #32 - adds x15, x1, x8 - adc x1, xzr, xzr - subs x7, x7, x13 - sbcs x13, x16, x15 - lsl x3, x7, #32 - umulh x16, x2, x5 - sbcs x15, x14, x1 - add x7, x3, x7 - sbcs x3, x17, xzr - lsr x1, x7, #32 - sbcs x14, x11, xzr - sbc x11, x8, xzr - subs x8, x1, x7 - sbc x1, x7, xzr - extr x8, x1, x8, #32 - lsr x1, x1, #32 - adds x1, x1, x7 - adc x17, xzr, xzr - subs x13, x13, x8 - umulh x8, x9, x6 - sbcs x1, x15, x1 - sbcs x15, x3, x17 - sbcs x3, x14, xzr - mul x17, x2, x5 - sbcs x11, x11, xzr - stp x13, x1, [sp, #48] - sbc x14, x7, xzr - mul x7, x4, x10 - subs x1, x9, x2 - stp x15, x3, [sp, #64] - csetm x15, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - stp x11, x14, [sp, #80] - mul x14, x9, x6 - adds x17, x8, x17 - adcs x7, x16, x7 - adc x13, x12, xzr - subs x12, x5, x6 - cneg x3, x12, cc // cc = lo, ul, last - cinv x16, x15, cc // cc = lo, ul, last - mul x8, x1, x3 - umulh x1, x1, x3 - eor x12, x8, x16 - adds x11, x17, x14 - adcs x3, x7, x17 - adcs x15, x13, x7 - adc x8, x13, xzr - adds x3, x3, x14 - adcs x15, x15, x17 - adcs x17, x8, x7 - eor x1, x1, x16 - adc x13, x13, xzr - subs x9, x9, x4 - csetm x8, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x4, x2, x4 - cneg x4, x4, cc // cc = lo, ul, last - csetm x7, cc // cc = lo, ul, last - subs x2, x10, x6 - cinv x8, x8, cc // cc = lo, ul, last - cneg x2, x2, cc // cc = lo, ul, last - cmn x16, #0x1 - adcs x11, x11, x12 - mul x12, x9, x2 - adcs x3, x3, x1 - adcs x15, x15, x16 - umulh x9, x9, x2 - adcs x17, x17, x16 - adc x13, x13, x16 - subs x1, x10, x5 - cinv x2, x7, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - eor x9, x9, x8 - cmn x8, #0x1 - eor x7, x12, x8 - mul x12, x4, x1 - adcs x3, x3, x7 - adcs x7, x15, x9 - adcs x15, x17, x8 - ldp x9, x17, [sp, #64] - umulh x4, x4, x1 - adc x8, x13, x8 - cmn x2, #0x1 - eor x1, x12, x2 - adcs x1, x7, x1 - ldp x7, x16, [sp, #48] - eor x12, x4, x2 - adcs x4, x15, x12 - ldp x15, x12, [sp, #80] - adc x8, x8, x2 - adds x13, x14, x14 - umulh x14, x5, x10 - adcs x2, x11, x11 - adcs x3, x3, x3 - adcs x1, x1, x1 - adcs x4, x4, x4 - adcs x11, x8, x8 - adc x8, xzr, xzr - adds x13, x13, x7 - adcs x2, x2, x16 - mul x16, x5, x10 - adcs x3, x3, x9 - adcs x1, x1, x17 - umulh x5, x5, x5 - lsl x9, x13, #32 - add x9, x9, x13 - adcs x4, x4, x15 - mov x13, v28.d[1] - adcs x15, x11, x12 - lsr x7, x9, #32 - adc x11, x8, xzr - subs x7, x7, x9 - umulh x10, x10, x10 - sbc x17, x9, xzr - extr x7, x17, x7, #32 - lsr x17, x17, #32 - adds x17, x17, x9 - adc x12, xzr, xzr - subs x8, x2, x7 - sbcs x17, x3, x17 - lsl x7, x8, #32 - sbcs x2, x1, x12 - add x3, x7, x8 - sbcs x12, x4, xzr - lsr x1, x3, #32 - sbcs x7, x15, xzr - sbc x15, x9, xzr - subs x1, x1, x3 - sbc x4, x3, xzr - lsr x9, x4, #32 - extr x8, x4, x1, #32 - adds x9, x9, x3 - adc x4, xzr, xzr - subs x1, x17, x8 - lsl x17, x1, #32 - 
sbcs x8, x2, x9 - sbcs x9, x12, x4 - add x17, x17, x1 - mov x1, v18.d[1] - lsr x2, x17, #32 - sbcs x7, x7, xzr - mov x12, v18.d[0] - sbcs x15, x15, xzr - sbc x3, x3, xzr - subs x4, x2, x17 - sbc x2, x17, xzr - adds x12, x13, x12 - adcs x16, x16, x1 - lsr x13, x2, #32 - extr x1, x2, x4, #32 - adc x2, x14, xzr - adds x4, x13, x17 - mul x13, x6, x6 - adc x14, xzr, xzr - subs x1, x8, x1 - sbcs x4, x9, x4 - mov x9, v28.d[0] - sbcs x7, x7, x14 - sbcs x8, x15, xzr - sbcs x3, x3, xzr - sbc x14, x17, xzr - adds x17, x9, x9 - adcs x12, x12, x12 - mov x15, v19.d[0] - adcs x9, x16, x16 - umulh x6, x6, x6 - adcs x16, x2, x2 - adc x2, xzr, xzr - adds x11, x11, x8 - adcs x3, x3, xzr - adcs x14, x14, xzr - adcs x8, xzr, xzr - adds x13, x1, x13 - mov x1, v19.d[1] - adcs x6, x4, x6 - mov x4, #0xffffffff // #4294967295 - adcs x15, x7, x15 - adcs x7, x11, x5 - adcs x1, x3, x1 - adcs x14, x14, x10 - adc x11, x8, xzr - adds x6, x6, x17 - adcs x8, x15, x12 - adcs x3, x7, x9 - adcs x15, x1, x16 - mov x16, #0xffffffff00000001 // #-4294967295 - adcs x14, x14, x2 - mov x2, #0x1 // #1 - adc x17, x11, xzr - cmn x13, x16 - adcs xzr, x6, x4 - adcs xzr, x8, x2 - adcs xzr, x3, xzr - adcs xzr, x15, xzr - adcs xzr, x14, xzr - adc x1, x17, xzr - neg x9, x1 - and x1, x16, x9 - adds x11, x13, x1 - and x13, x4, x9 - adcs x5, x6, x13 - and x1, x2, x9 - adcs x7, x8, x1 - stp x11, x5, [sp, #48] - adcs x11, x3, xzr - adcs x2, x15, xzr - stp x7, x11, [sp, #64] - adc x17, x14, xzr - stp x2, x17, [sp, #80] - ldp x5, x6, [x26] - ldp x4, x3, [sp] - adds x5, x5, x4 - adcs x6, x6, x3 - ldp x7, x8, [x26, #16] - ldp x4, x3, [sp, #16] - adcs x7, x7, x4 - adcs x8, x8, x3 - ldp x9, x10, [x26, #32] - ldp x4, x3, [sp, #32] - adcs x9, x9, x4 - adcs x10, x10, x3 - csetm x3, cs // cs = hs, nlast - mov x4, #0xffffffff // #4294967295 - and x4, x4, x3 - subs x5, x5, x4 - eor x4, x4, x3 - sbcs x6, x6, x4 - mov x4, #0xfffffffffffffffe // #-2 - and x4, x4, x3 - sbcs x7, x7, x4 - sbcs x8, x8, x3 - sbcs x9, x9, x3 - sbc x10, x10, x3 - stp x5, x6, [sp, #240] - stp x7, x8, [sp, #256] - stp x9, x10, [sp, #272] - mov x2, sp - ldp x5, x6, [x26] - ldp x4, x3, [x2] - subs x5, x5, x4 - sbcs x6, x6, x3 - ldp x7, x8, [x26, #16] - ldp x4, x3, [x2, #16] - sbcs x7, x7, x4 - sbcs x8, x8, x3 - ldp x9, x10, [x26, #32] - ldp x4, x3, [x2, #32] - sbcs x9, x9, x4 - sbcs x10, x10, x3 - csetm x3, cc // cc = lo, ul, last - mov x4, #0xffffffff // #4294967295 - and x4, x4, x3 - adds x13, x5, x4 - eor x4, x4, x3 - adcs x23, x6, x4 - mov x4, #0xfffffffffffffffe // #-2 - and x4, x4, x3 - adcs x7, x7, x4 - adcs x8, x8, x3 - adcs x9, x9, x3 - adc x10, x10, x3 - stp x13, x23, [sp, #192] - stp x7, x8, [sp, #208] - stp x9, x10, [sp, #224] - ldr q3, [sp, #240] - ldr q25, [sp, #192] - ldp x3, x21, [sp, #240] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [sp, #224] - ldp x8, x24, [sp, #256] - subs x6, x3, x21 - ldr q0, [sp, #272] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [sp, #208] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs 
x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [sp, #224] - add x2, x12, x7 - adc x7, x5, x10 - ldp x5, x10, [sp, #272] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x16, x4, x16 - mov x4, v27.d[0] - sbcs x11, x20, x11 - sbcs x20, x9, x12 - stp x16, x11, [sp, #96] - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #112] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #128] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - 
subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - ldp x20, x9, [sp, #96] - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #112] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #128] - adds x20, x22, x20 - mul x10, x13, x14 - adcs x11, x11, x9 - eor x9, x8, x21 - adcs x21, x19, x17 - stp x20, x11, [sp, #96] - adcs x12, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - stp x21, x12, [sp, #112] - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #128] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #96] - ldp x21, x12, [sp, #112] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #128] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x21 - eor x1, x22, x9 - adcs x24, x23, x12 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x21 - adcs x15, x17, x12 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc 
x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x14, x24, x11 - stp x22, x5, [sp, #96] - adcs x5, x13, x23 - adcs x21, x8, x23 - stp x14, x5, [sp, #112] - adc x12, x15, x23 - stp x21, x12, [sp, #128] - ldp x5, x6, [x26, #48] - ldp x4, x3, [x26, #96] - adds x5, x5, x4 - adcs x6, x6, x3 - ldp x7, x8, [x26, #64] - ldp x4, x3, [x26, #112] - adcs x7, x7, x4 - adcs x8, x8, x3 - ldp x9, x10, [x26, #80] - ldp x4, x3, [x26, #128] - adcs x9, x9, x4 - adcs x10, x10, x3 - adc x3, xzr, xzr - mov x4, #0xffffffff // #4294967295 - cmp x5, x4 - mov x4, #0xffffffff00000000 // #-4294967296 - sbcs xzr, x6, x4 - mov x4, #0xfffffffffffffffe // #-2 - sbcs xzr, x7, x4 - adcs xzr, x8, xzr - adcs xzr, x9, xzr - adcs xzr, x10, xzr - adcs x3, x3, xzr - csetm x3, ne // ne = any - mov x4, #0xffffffff // #4294967295 - and x4, x4, x3 - subs x5, x5, x4 - eor x4, x4, x3 - sbcs x6, x6, x4 - mov x4, #0xfffffffffffffffe // #-2 - and x4, x4, x3 - sbcs x7, x7, x4 - sbcs x8, x8, x3 - sbcs x9, x9, x3 - sbc x10, x10, x3 - stp x5, x6, [sp, #240] - stp x7, x8, [sp, #256] - stp x9, x10, [sp, #272] - ldr q1, [sp, #96] - ldp x9, x2, [sp, #96] - ldr q0, [sp, #96] - ldp x4, x6, [sp, #112] - rev64 v21.4s, v1.4s - uzp2 v28.4s, v1.4s, v1.4s - umulh x7, x9, x2 - xtn v17.2s, v1.2d - mul v27.4s, v21.4s, v0.4s - ldr q20, [sp, #128] - xtn v30.2s, v0.2d - ldr q1, [sp, #128] - uzp2 v31.4s, v0.4s, v0.4s - ldp x5, x10, [sp, #128] - umulh x8, x9, x4 - uaddlp v3.2d, v27.4s - umull v16.2d, v30.2s, v17.2s - mul x16, x9, x4 - umull v27.2d, v30.2s, v28.2s - shrn v0.2s, v20.2d, #32 - xtn v7.2s, v20.2d - shl v20.2d, v3.2d, #32 - umull v3.2d, v31.2s, v28.2s - mul x3, x2, x4 - umlal v20.2d, v30.2s, v17.2s - umull v22.2d, v7.2s, v0.2s - usra v27.2d, v16.2d, #32 - umulh x11, x2, x4 - movi v21.2d, #0xffffffff - uzp2 v28.4s, v1.4s, v1.4s - adds x15, x16, x7 - and v5.16b, v27.16b, v21.16b - adcs x3, x3, x8 - usra v3.2d, v27.2d, #32 - dup v29.2d, x6 - adcs x16, x11, xzr - mov x14, v20.d[0] - umlal v5.2d, v31.2s, v17.2s - mul x8, x9, x2 - mov x7, v20.d[1] - shl v19.2d, v22.2d, #33 - xtn v25.2s, v29.2d - rev64 v31.4s, v1.4s - lsl x13, x14, #32 - uzp2 v6.4s, v29.4s, v29.4s - umlal v19.2d, v7.2s, v7.2s - usra v3.2d, v5.2d, #32 - adds x1, x8, x8 - umulh x8, x4, x4 - add x12, x13, x14 - mul v17.4s, v31.4s, v29.4s - xtn v4.2s, v1.2d - adcs x14, x15, x15 - lsr x13, x12, #32 - adcs x15, x3, x3 - umull v31.2d, v25.2s, v28.2s - adcs x11, x16, x16 - umull v21.2d, v25.2s, v4.2s - mov x17, v3.d[0] - umull v18.2d, v6.2s, v28.2s - adc x16, x8, xzr - uaddlp v16.2d, v17.4s - movi v1.2d, #0xffffffff - subs x13, x13, x12 - usra v31.2d, v21.2d, #32 - sbc x8, x12, xzr - adds x17, x17, x1 - mul x1, x4, x4 - shl v28.2d, v16.2d, #32 - mov x3, v3.d[1] - adcs x14, x7, x14 - extr x7, x8, x13, #32 - adcs x13, x3, 
x15 - and v3.16b, v31.16b, v1.16b - adcs x11, x1, x11 - lsr x1, x8, #32 - umlal v3.2d, v6.2s, v4.2s - usra v18.2d, v31.2d, #32 - adc x3, x16, xzr - adds x1, x1, x12 - umlal v28.2d, v25.2s, v4.2s - adc x16, xzr, xzr - subs x15, x17, x7 - sbcs x7, x14, x1 - lsl x1, x15, #32 - sbcs x16, x13, x16 - add x8, x1, x15 - usra v18.2d, v3.2d, #32 - sbcs x14, x11, xzr - lsr x1, x8, #32 - sbcs x17, x3, xzr - sbc x11, x12, xzr - subs x13, x1, x8 - umulh x12, x4, x10 - sbc x1, x8, xzr - extr x13, x1, x13, #32 - lsr x1, x1, #32 - adds x15, x1, x8 - adc x1, xzr, xzr - subs x7, x7, x13 - sbcs x13, x16, x15 - lsl x3, x7, #32 - umulh x16, x2, x5 - sbcs x15, x14, x1 - add x7, x3, x7 - sbcs x3, x17, xzr - lsr x1, x7, #32 - sbcs x14, x11, xzr - sbc x11, x8, xzr - subs x8, x1, x7 - sbc x1, x7, xzr - extr x8, x1, x8, #32 - lsr x1, x1, #32 - adds x1, x1, x7 - adc x17, xzr, xzr - subs x13, x13, x8 - umulh x8, x9, x6 - sbcs x1, x15, x1 - sbcs x15, x3, x17 - sbcs x3, x14, xzr - mul x17, x2, x5 - sbcs x11, x11, xzr - stp x13, x1, [sp, #288] - sbc x14, x7, xzr - mul x7, x4, x10 - subs x1, x9, x2 - stp x15, x3, [sp, #304] - csetm x15, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - stp x11, x14, [sp, #320] - mul x14, x9, x6 - adds x17, x8, x17 - adcs x7, x16, x7 - adc x13, x12, xzr - subs x12, x5, x6 - cneg x3, x12, cc // cc = lo, ul, last - cinv x16, x15, cc // cc = lo, ul, last - mul x8, x1, x3 - umulh x1, x1, x3 - eor x12, x8, x16 - adds x11, x17, x14 - adcs x3, x7, x17 - adcs x15, x13, x7 - adc x8, x13, xzr - adds x3, x3, x14 - adcs x15, x15, x17 - adcs x17, x8, x7 - eor x1, x1, x16 - adc x13, x13, xzr - subs x9, x9, x4 - csetm x8, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x4, x2, x4 - cneg x4, x4, cc // cc = lo, ul, last - csetm x7, cc // cc = lo, ul, last - subs x2, x10, x6 - cinv x8, x8, cc // cc = lo, ul, last - cneg x2, x2, cc // cc = lo, ul, last - cmn x16, #0x1 - adcs x11, x11, x12 - mul x12, x9, x2 - adcs x3, x3, x1 - adcs x15, x15, x16 - umulh x9, x9, x2 - adcs x17, x17, x16 - adc x13, x13, x16 - subs x1, x10, x5 - cinv x2, x7, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - eor x9, x9, x8 - cmn x8, #0x1 - eor x7, x12, x8 - mul x12, x4, x1 - adcs x3, x3, x7 - adcs x7, x15, x9 - adcs x15, x17, x8 - ldp x9, x17, [sp, #304] - umulh x4, x4, x1 - adc x8, x13, x8 - cmn x2, #0x1 - eor x1, x12, x2 - adcs x1, x7, x1 - ldp x7, x16, [sp, #288] - eor x12, x4, x2 - adcs x4, x15, x12 - ldp x15, x12, [sp, #320] - adc x8, x8, x2 - adds x13, x14, x14 - umulh x14, x5, x10 - adcs x2, x11, x11 - adcs x3, x3, x3 - adcs x1, x1, x1 - adcs x4, x4, x4 - adcs x11, x8, x8 - adc x8, xzr, xzr - adds x13, x13, x7 - adcs x2, x2, x16 - mul x16, x5, x10 - adcs x3, x3, x9 - adcs x1, x1, x17 - umulh x5, x5, x5 - lsl x9, x13, #32 - add x9, x9, x13 - adcs x4, x4, x15 - mov x13, v28.d[1] - adcs x15, x11, x12 - lsr x7, x9, #32 - adc x11, x8, xzr - subs x7, x7, x9 - umulh x10, x10, x10 - sbc x17, x9, xzr - extr x7, x17, x7, #32 - lsr x17, x17, #32 - adds x17, x17, x9 - adc x12, xzr, xzr - subs x8, x2, x7 - sbcs x17, x3, x17 - lsl x7, x8, #32 - sbcs x2, x1, x12 - add x3, x7, x8 - sbcs x12, x4, xzr - lsr x1, x3, #32 - sbcs x7, x15, xzr - sbc x15, x9, xzr - subs x1, x1, x3 - sbc x4, x3, xzr - lsr x9, x4, #32 - extr x8, x4, x1, #32 - adds x9, x9, x3 - adc x4, xzr, xzr - subs x1, x17, x8 - lsl x17, x1, #32 - sbcs x8, x2, x9 - sbcs x9, x12, x4 - add x17, x17, x1 - mov x1, v18.d[1] - lsr x2, x17, #32 - sbcs x7, x7, xzr - mov x12, v18.d[0] - sbcs x15, x15, xzr - sbc x3, x3, xzr - subs x4, x2, 
x17 - sbc x2, x17, xzr - adds x12, x13, x12 - adcs x16, x16, x1 - lsr x13, x2, #32 - extr x1, x2, x4, #32 - adc x2, x14, xzr - adds x4, x13, x17 - mul x13, x6, x6 - adc x14, xzr, xzr - subs x1, x8, x1 - sbcs x4, x9, x4 - mov x9, v28.d[0] - sbcs x7, x7, x14 - sbcs x8, x15, xzr - sbcs x3, x3, xzr - sbc x14, x17, xzr - adds x17, x9, x9 - adcs x12, x12, x12 - mov x15, v19.d[0] - adcs x9, x16, x16 - umulh x6, x6, x6 - adcs x16, x2, x2 - adc x2, xzr, xzr - adds x11, x11, x8 - adcs x3, x3, xzr - adcs x14, x14, xzr - adcs x8, xzr, xzr - adds x13, x1, x13 - mov x1, v19.d[1] - adcs x6, x4, x6 - mov x4, #0xffffffff // #4294967295 - adcs x15, x7, x15 - adcs x7, x11, x5 - adcs x1, x3, x1 - adcs x14, x14, x10 - adc x11, x8, xzr - adds x6, x6, x17 - adcs x8, x15, x12 - adcs x3, x7, x9 - adcs x15, x1, x16 - mov x16, #0xffffffff00000001 // #-4294967295 - adcs x14, x14, x2 - mov x2, #0x1 // #1 - adc x17, x11, xzr - cmn x13, x16 - adcs xzr, x6, x4 - adcs xzr, x8, x2 - adcs xzr, x3, xzr - adcs xzr, x15, xzr - adcs xzr, x14, xzr - adc x1, x17, xzr - neg x9, x1 - and x1, x16, x9 - adds x11, x13, x1 - and x13, x4, x9 - adcs x5, x6, x13 - and x1, x2, x9 - adcs x7, x8, x1 - stp x11, x5, [sp, #288] - adcs x11, x3, xzr - adcs x2, x15, xzr - stp x7, x11, [sp, #304] - adc x17, x14, xzr - stp x2, x17, [sp, #320] - ldr q3, [x26] - ldr q25, [sp, #48] - ldp x13, x23, [sp, #48] - ldp x3, x21, [x26] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [sp, #80] - ldp x8, x24, [x26, #16] - subs x6, x3, x21 - ldr q0, [x26, #32] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [sp, #64] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [sp, #80] - add x2, x12, x7 - 
adc x7, x5, x10 - ldp x5, x10, [x26, #32] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x26, x4, x16 - mov x4, v27.d[0] - sbcs x27, x20, x11 - sbcs x20, x9, x12 - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #160] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #176] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #160] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #176] - adds x20, x22, x26 - mul x10, x13, x14 - adcs x11, x11, x27 - eor x9, x8, x21 - adcs x26, x19, x17 - stp x20, x11, [sp, #144] - adcs x27, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #176] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // 
cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #144] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #176] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x26 - eor x1, x22, x9 - adcs x24, x23, x27 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x26 - adcs x15, x17, x27 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x14, x24, x11 - stp x22, x5, [sp, #144] - adcs x5, x13, x23 - adcs x21, x8, x23 - stp x14, x5, [sp, #160] - adc x12, x15, x23 - stp x21, x12, [sp, #176] - ldr q1, [sp, #240] - ldp x9, x2, [sp, #240] - ldr q0, [sp, #240] - ldp x4, x6, [sp, #256] - rev64 v21.4s, v1.4s - uzp2 v28.4s, v1.4s, v1.4s - umulh x7, x9, x2 - xtn v17.2s, v1.2d - mul v27.4s, v21.4s, v0.4s - ldr q20, [sp, #272] - xtn v30.2s, v0.2d - ldr q1, [sp, #272] - uzp2 v31.4s, v0.4s, v0.4s - ldp x5, x10, [sp, #272] - umulh x8, x9, x4 - uaddlp v3.2d, v27.4s - umull v16.2d, v30.2s, v17.2s - mul x16, x9, x4 - umull v27.2d, v30.2s, v28.2s - shrn v0.2s, v20.2d, #32 - xtn 
v7.2s, v20.2d - shl v20.2d, v3.2d, #32 - umull v3.2d, v31.2s, v28.2s - mul x3, x2, x4 - umlal v20.2d, v30.2s, v17.2s - umull v22.2d, v7.2s, v0.2s - usra v27.2d, v16.2d, #32 - umulh x11, x2, x4 - movi v21.2d, #0xffffffff - uzp2 v28.4s, v1.4s, v1.4s - adds x15, x16, x7 - and v5.16b, v27.16b, v21.16b - adcs x3, x3, x8 - usra v3.2d, v27.2d, #32 - dup v29.2d, x6 - adcs x16, x11, xzr - mov x14, v20.d[0] - umlal v5.2d, v31.2s, v17.2s - mul x8, x9, x2 - mov x7, v20.d[1] - shl v19.2d, v22.2d, #33 - xtn v25.2s, v29.2d - rev64 v31.4s, v1.4s - lsl x13, x14, #32 - uzp2 v6.4s, v29.4s, v29.4s - umlal v19.2d, v7.2s, v7.2s - usra v3.2d, v5.2d, #32 - adds x1, x8, x8 - umulh x8, x4, x4 - add x12, x13, x14 - mul v17.4s, v31.4s, v29.4s - xtn v4.2s, v1.2d - adcs x14, x15, x15 - lsr x13, x12, #32 - adcs x15, x3, x3 - umull v31.2d, v25.2s, v28.2s - adcs x11, x16, x16 - umull v21.2d, v25.2s, v4.2s - mov x17, v3.d[0] - umull v18.2d, v6.2s, v28.2s - adc x16, x8, xzr - uaddlp v16.2d, v17.4s - movi v1.2d, #0xffffffff - subs x13, x13, x12 - usra v31.2d, v21.2d, #32 - sbc x8, x12, xzr - adds x17, x17, x1 - mul x1, x4, x4 - shl v28.2d, v16.2d, #32 - mov x3, v3.d[1] - adcs x14, x7, x14 - extr x7, x8, x13, #32 - adcs x13, x3, x15 - and v3.16b, v31.16b, v1.16b - adcs x11, x1, x11 - lsr x1, x8, #32 - umlal v3.2d, v6.2s, v4.2s - usra v18.2d, v31.2d, #32 - adc x3, x16, xzr - adds x1, x1, x12 - umlal v28.2d, v25.2s, v4.2s - adc x16, xzr, xzr - subs x15, x17, x7 - sbcs x7, x14, x1 - lsl x1, x15, #32 - sbcs x16, x13, x16 - add x8, x1, x15 - usra v18.2d, v3.2d, #32 - sbcs x14, x11, xzr - lsr x1, x8, #32 - sbcs x17, x3, xzr - sbc x11, x12, xzr - subs x13, x1, x8 - umulh x12, x4, x10 - sbc x1, x8, xzr - extr x13, x1, x13, #32 - lsr x1, x1, #32 - adds x15, x1, x8 - adc x1, xzr, xzr - subs x7, x7, x13 - sbcs x13, x16, x15 - lsl x3, x7, #32 - umulh x16, x2, x5 - sbcs x15, x14, x1 - add x7, x3, x7 - sbcs x3, x17, xzr - lsr x1, x7, #32 - sbcs x14, x11, xzr - sbc x11, x8, xzr - subs x8, x1, x7 - sbc x1, x7, xzr - extr x8, x1, x8, #32 - lsr x1, x1, #32 - adds x1, x1, x7 - adc x17, xzr, xzr - subs x13, x13, x8 - umulh x8, x9, x6 - sbcs x1, x15, x1 - sbcs x19, x3, x17 - sbcs x20, x14, xzr - mul x17, x2, x5 - sbcs x11, x11, xzr - stp x13, x1, [sp, #192] - sbc x14, x7, xzr - mul x7, x4, x10 - subs x1, x9, x2 - csetm x15, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - stp x11, x14, [sp, #224] - mul x14, x9, x6 - adds x17, x8, x17 - adcs x7, x16, x7 - adc x13, x12, xzr - subs x12, x5, x6 - cneg x3, x12, cc // cc = lo, ul, last - cinv x16, x15, cc // cc = lo, ul, last - mul x8, x1, x3 - umulh x1, x1, x3 - eor x12, x8, x16 - adds x11, x17, x14 - adcs x3, x7, x17 - adcs x15, x13, x7 - adc x8, x13, xzr - adds x3, x3, x14 - adcs x15, x15, x17 - adcs x17, x8, x7 - eor x1, x1, x16 - adc x13, x13, xzr - subs x9, x9, x4 - csetm x8, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x4, x2, x4 - cneg x4, x4, cc // cc = lo, ul, last - csetm x7, cc // cc = lo, ul, last - subs x2, x10, x6 - cinv x8, x8, cc // cc = lo, ul, last - cneg x2, x2, cc // cc = lo, ul, last - cmn x16, #0x1 - adcs x11, x11, x12 - mul x12, x9, x2 - adcs x3, x3, x1 - adcs x15, x15, x16 - umulh x9, x9, x2 - adcs x17, x17, x16 - adc x13, x13, x16 - subs x1, x10, x5 - cinv x2, x7, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - eor x9, x9, x8 - cmn x8, #0x1 - eor x7, x12, x8 - mul x12, x4, x1 - adcs x3, x3, x7 - adcs x7, x15, x9 - adcs x15, x17, x8 - umulh x4, x4, x1 - adc x8, x13, x8 - cmn x2, #0x1 - eor x1, x12, x2 - adcs x1, x7, x1 - 
ldp x7, x16, [sp, #192] - eor x12, x4, x2 - adcs x4, x15, x12 - ldp x15, x12, [sp, #224] - adc x8, x8, x2 - adds x13, x14, x14 - umulh x14, x5, x10 - adcs x2, x11, x11 - adcs x3, x3, x3 - adcs x1, x1, x1 - adcs x4, x4, x4 - adcs x11, x8, x8 - adc x8, xzr, xzr - adds x13, x13, x7 - adcs x2, x2, x16 - mul x16, x5, x10 - adcs x3, x3, x19 - adcs x1, x1, x20 - umulh x5, x5, x5 - lsl x9, x13, #32 - add x9, x9, x13 - adcs x4, x4, x15 - mov x13, v28.d[1] - adcs x15, x11, x12 - lsr x7, x9, #32 - adc x11, x8, xzr - subs x7, x7, x9 - umulh x10, x10, x10 - sbc x17, x9, xzr - extr x7, x17, x7, #32 - lsr x17, x17, #32 - adds x17, x17, x9 - adc x12, xzr, xzr - subs x8, x2, x7 - sbcs x17, x3, x17 - lsl x7, x8, #32 - sbcs x2, x1, x12 - add x3, x7, x8 - sbcs x12, x4, xzr - lsr x1, x3, #32 - sbcs x7, x15, xzr - sbc x15, x9, xzr - subs x1, x1, x3 - sbc x4, x3, xzr - lsr x9, x4, #32 - extr x8, x4, x1, #32 - adds x9, x9, x3 - adc x4, xzr, xzr - subs x1, x17, x8 - lsl x17, x1, #32 - sbcs x8, x2, x9 - sbcs x9, x12, x4 - add x17, x17, x1 - mov x1, v18.d[1] - lsr x2, x17, #32 - sbcs x7, x7, xzr - mov x12, v18.d[0] - sbcs x15, x15, xzr - sbc x3, x3, xzr - subs x4, x2, x17 - sbc x2, x17, xzr - adds x12, x13, x12 - adcs x16, x16, x1 - lsr x13, x2, #32 - extr x1, x2, x4, #32 - adc x2, x14, xzr - adds x4, x13, x17 - mul x13, x6, x6 - adc x14, xzr, xzr - subs x1, x8, x1 - sbcs x4, x9, x4 - mov x9, v28.d[0] - sbcs x7, x7, x14 - sbcs x8, x15, xzr - sbcs x3, x3, xzr - sbc x14, x17, xzr - adds x17, x9, x9 - adcs x12, x12, x12 - mov x15, v19.d[0] - adcs x9, x16, x16 - umulh x6, x6, x6 - adcs x16, x2, x2 - adc x2, xzr, xzr - adds x11, x11, x8 - adcs x3, x3, xzr - adcs x14, x14, xzr - adcs x8, xzr, xzr - adds x13, x1, x13 - mov x1, v19.d[1] - adcs x6, x4, x6 - mov x4, #0xffffffff // #4294967295 - adcs x15, x7, x15 - adcs x7, x11, x5 - adcs x1, x3, x1 - adcs x14, x14, x10 - adc x11, x8, xzr - adds x6, x6, x17 - adcs x8, x15, x12 - adcs x3, x7, x9 - adcs x15, x1, x16 - mov x16, #0xffffffff00000001 // #-4294967295 - adcs x14, x14, x2 - mov x2, #0x1 // #1 - adc x17, x11, xzr - cmn x13, x16 - adcs xzr, x6, x4 - adcs xzr, x8, x2 - adcs xzr, x3, xzr - adcs xzr, x15, xzr - adcs xzr, x14, xzr - adc x1, x17, xzr - neg x9, x1 - and x1, x16, x9 - adds x19, x13, x1 - and x13, x4, x9 - adcs x20, x6, x13 - and x1, x2, x9 - adcs x7, x8, x1 - adcs x11, x3, xzr - adcs x2, x15, xzr - stp x7, x11, [sp, #208] - adc x17, x14, xzr - stp x2, x17, [sp, #224] - ldp x0, x1, [sp, #288] - mov x6, #0xffffffff // #4294967295 - subs x6, x6, x0 - mov x7, #0xffffffff00000000 // #-4294967296 - sbcs x7, x7, x1 - ldp x0, x1, [sp, #304] - mov x8, #0xfffffffffffffffe // #-2 - sbcs x8, x8, x0 - mov x13, #0xffffffffffffffff // #-1 - sbcs x9, x13, x1 - ldp x0, x1, [sp, #320] - sbcs x10, x13, x0 - sbc x11, x13, x1 - mov x12, #0x9 // #9 - mul x0, x12, x6 - mul x1, x12, x7 - mul x2, x12, x8 - mul x3, x12, x9 - mul x4, x12, x10 - mul x5, x12, x11 - umulh x6, x12, x6 - umulh x7, x12, x7 - umulh x8, x12, x8 - umulh x9, x12, x9 - umulh x10, x12, x10 - umulh x12, x12, x11 - adds x1, x1, x6 - adcs x2, x2, x7 - adcs x3, x3, x8 - adcs x4, x4, x9 - adcs x5, x5, x10 - mov x6, #0x1 // #1 - adc x6, x12, x6 - ldp x8, x9, [sp, #144] - ldp x10, x11, [sp, #160] - ldp x12, x13, [sp, #176] - mov x14, #0xc // #12 - mul x15, x14, x8 - umulh x8, x14, x8 - adds x0, x0, x15 - mul x15, x14, x9 - umulh x9, x14, x9 - adcs x1, x1, x15 - mul x15, x14, x10 - umulh x10, x14, x10 - adcs x2, x2, x15 - mul x15, x14, x11 - umulh x11, x14, x11 - adcs x3, x3, x15 - mul x15, x14, x12 - umulh x12, x14, x12 - 
adcs x4, x4, x15 - mul x15, x14, x13 - umulh x13, x14, x13 - adcs x5, x5, x15 - adc x6, x6, xzr - adds x1, x1, x8 - adcs x2, x2, x9 - adcs x3, x3, x10 - adcs x4, x4, x11 - adcs x5, x5, x12 - adcs x6, x6, x13 - lsl x7, x6, #32 - subs x8, x6, x7 - sbc x7, x7, xzr - adds x0, x0, x8 - adcs x1, x1, x7 - adcs x2, x2, x6 - adcs x3, x3, xzr - adcs x4, x4, xzr - adcs x5, x5, xzr - csetm x6, cc // cc = lo, ul, last - mov x7, #0xffffffff // #4294967295 - and x7, x7, x6 - adds x0, x0, x7 - eor x7, x7, x6 - adcs x1, x1, x7 - mov x7, #0xfffffffffffffffe // #-2 - and x7, x7, x6 - adcs x2, x2, x7 - adcs x3, x3, x6 - adcs x4, x4, x6 - adc x5, x5, x6 - stp x0, x1, [sp, #288] - stp x2, x3, [sp, #304] - stp x4, x5, [sp, #320] - mov x2, sp - ldp x4, x3, [x2] - subs x5, x19, x4 - sbcs x6, x20, x3 - ldp x7, x8, [sp, #208] - ldp x4, x3, [x2, #16] - sbcs x7, x7, x4 - sbcs x8, x8, x3 - ldp x9, x10, [sp, #224] - ldp x4, x3, [x2, #32] - sbcs x9, x9, x4 - sbcs x10, x10, x3 - csetm x3, cc // cc = lo, ul, last - mov x4, #0xffffffff // #4294967295 - and x4, x4, x3 - adds x5, x5, x4 - eor x4, x4, x3 - adcs x6, x6, x4 - mov x4, #0xfffffffffffffffe // #-2 - and x4, x4, x3 - adcs x7, x7, x4 - adcs x8, x8, x3 - adcs x9, x9, x3 - adc x10, x10, x3 - stp x5, x6, [sp, #240] - stp x7, x8, [sp, #256] - stp x9, x10, [sp, #272] - ldr q1, [sp, #48] - ldp x9, x2, [sp, #48] - ldr q0, [sp, #48] - ldp x4, x6, [sp, #64] - rev64 v21.4s, v1.4s - uzp2 v28.4s, v1.4s, v1.4s - umulh x7, x9, x2 - xtn v17.2s, v1.2d - mul v27.4s, v21.4s, v0.4s - ldr q20, [sp, #80] - xtn v30.2s, v0.2d - ldr q1, [sp, #80] - uzp2 v31.4s, v0.4s, v0.4s - ldp x5, x10, [sp, #80] - umulh x8, x9, x4 - uaddlp v3.2d, v27.4s - umull v16.2d, v30.2s, v17.2s - mul x16, x9, x4 - umull v27.2d, v30.2s, v28.2s - shrn v0.2s, v20.2d, #32 - xtn v7.2s, v20.2d - shl v20.2d, v3.2d, #32 - umull v3.2d, v31.2s, v28.2s - mul x3, x2, x4 - umlal v20.2d, v30.2s, v17.2s - umull v22.2d, v7.2s, v0.2s - usra v27.2d, v16.2d, #32 - umulh x11, x2, x4 - movi v21.2d, #0xffffffff - uzp2 v28.4s, v1.4s, v1.4s - adds x15, x16, x7 - and v5.16b, v27.16b, v21.16b - adcs x3, x3, x8 - usra v3.2d, v27.2d, #32 - dup v29.2d, x6 - adcs x16, x11, xzr - mov x14, v20.d[0] - umlal v5.2d, v31.2s, v17.2s - mul x8, x9, x2 - mov x7, v20.d[1] - shl v19.2d, v22.2d, #33 - xtn v25.2s, v29.2d - rev64 v31.4s, v1.4s - lsl x13, x14, #32 - uzp2 v6.4s, v29.4s, v29.4s - umlal v19.2d, v7.2s, v7.2s - usra v3.2d, v5.2d, #32 - adds x1, x8, x8 - umulh x8, x4, x4 - add x12, x13, x14 - mul v17.4s, v31.4s, v29.4s - xtn v4.2s, v1.2d - adcs x14, x15, x15 - lsr x13, x12, #32 - adcs x15, x3, x3 - umull v31.2d, v25.2s, v28.2s - adcs x11, x16, x16 - umull v21.2d, v25.2s, v4.2s - mov x17, v3.d[0] - umull v18.2d, v6.2s, v28.2s - adc x16, x8, xzr - uaddlp v16.2d, v17.4s - movi v1.2d, #0xffffffff - subs x13, x13, x12 - usra v31.2d, v21.2d, #32 - sbc x8, x12, xzr - adds x17, x17, x1 - mul x1, x4, x4 - shl v28.2d, v16.2d, #32 - mov x3, v3.d[1] - adcs x14, x7, x14 - extr x7, x8, x13, #32 - adcs x13, x3, x15 - and v3.16b, v31.16b, v1.16b - adcs x11, x1, x11 - lsr x1, x8, #32 - umlal v3.2d, v6.2s, v4.2s - usra v18.2d, v31.2d, #32 - adc x3, x16, xzr - adds x1, x1, x12 - umlal v28.2d, v25.2s, v4.2s - adc x16, xzr, xzr - subs x15, x17, x7 - sbcs x7, x14, x1 - lsl x1, x15, #32 - sbcs x16, x13, x16 - add x8, x1, x15 - usra v18.2d, v3.2d, #32 - sbcs x14, x11, xzr - lsr x1, x8, #32 - sbcs x17, x3, xzr - sbc x11, x12, xzr - subs x13, x1, x8 - umulh x12, x4, x10 - sbc x1, x8, xzr - extr x13, x1, x13, #32 - lsr x1, x1, #32 - adds x15, x1, x8 - adc x1, xzr, xzr - subs 
x7, x7, x13 - sbcs x13, x16, x15 - lsl x3, x7, #32 - umulh x16, x2, x5 - sbcs x15, x14, x1 - add x7, x3, x7 - sbcs x3, x17, xzr - lsr x1, x7, #32 - sbcs x14, x11, xzr - sbc x11, x8, xzr - subs x8, x1, x7 - sbc x1, x7, xzr - extr x8, x1, x8, #32 - lsr x1, x1, #32 - adds x1, x1, x7 - adc x17, xzr, xzr - subs x13, x13, x8 - umulh x8, x9, x6 - sbcs x1, x15, x1 - sbcs x19, x3, x17 - sbcs x20, x14, xzr - mul x17, x2, x5 - sbcs x11, x11, xzr - stp x13, x1, [sp, #192] - sbc x14, x7, xzr - mul x7, x4, x10 - subs x1, x9, x2 - csetm x15, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - stp x11, x14, [sp, #224] - mul x14, x9, x6 - adds x17, x8, x17 - adcs x7, x16, x7 - adc x13, x12, xzr - subs x12, x5, x6 - cneg x3, x12, cc // cc = lo, ul, last - cinv x16, x15, cc // cc = lo, ul, last - mul x8, x1, x3 - umulh x1, x1, x3 - eor x12, x8, x16 - adds x11, x17, x14 - adcs x3, x7, x17 - adcs x15, x13, x7 - adc x8, x13, xzr - adds x3, x3, x14 - adcs x15, x15, x17 - adcs x17, x8, x7 - eor x1, x1, x16 - adc x13, x13, xzr - subs x9, x9, x4 - csetm x8, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x4, x2, x4 - cneg x4, x4, cc // cc = lo, ul, last - csetm x7, cc // cc = lo, ul, last - subs x2, x10, x6 - cinv x8, x8, cc // cc = lo, ul, last - cneg x2, x2, cc // cc = lo, ul, last - cmn x16, #0x1 - adcs x11, x11, x12 - mul x12, x9, x2 - adcs x3, x3, x1 - adcs x15, x15, x16 - umulh x9, x9, x2 - adcs x17, x17, x16 - adc x13, x13, x16 - subs x1, x10, x5 - cinv x2, x7, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - eor x9, x9, x8 - cmn x8, #0x1 - eor x7, x12, x8 - mul x12, x4, x1 - adcs x3, x3, x7 - adcs x7, x15, x9 - adcs x15, x17, x8 - umulh x4, x4, x1 - adc x8, x13, x8 - cmn x2, #0x1 - eor x1, x12, x2 - adcs x1, x7, x1 - ldp x7, x16, [sp, #192] - eor x12, x4, x2 - adcs x4, x15, x12 - ldp x15, x12, [sp, #224] - adc x8, x8, x2 - adds x13, x14, x14 - umulh x14, x5, x10 - adcs x2, x11, x11 - adcs x3, x3, x3 - adcs x1, x1, x1 - adcs x4, x4, x4 - adcs x11, x8, x8 - adc x8, xzr, xzr - adds x13, x13, x7 - adcs x2, x2, x16 - mul x16, x5, x10 - adcs x3, x3, x19 - adcs x1, x1, x20 - umulh x5, x5, x5 - lsl x9, x13, #32 - add x9, x9, x13 - adcs x4, x4, x15 - mov x13, v28.d[1] - adcs x15, x11, x12 - lsr x7, x9, #32 - adc x11, x8, xzr - subs x7, x7, x9 - umulh x10, x10, x10 - sbc x17, x9, xzr - extr x7, x17, x7, #32 - lsr x17, x17, #32 - adds x17, x17, x9 - adc x12, xzr, xzr - subs x8, x2, x7 - sbcs x17, x3, x17 - lsl x7, x8, #32 - sbcs x2, x1, x12 - add x3, x7, x8 - sbcs x12, x4, xzr - lsr x1, x3, #32 - sbcs x7, x15, xzr - sbc x15, x9, xzr - subs x1, x1, x3 - sbc x4, x3, xzr - lsr x9, x4, #32 - extr x8, x4, x1, #32 - adds x9, x9, x3 - adc x4, xzr, xzr - subs x1, x17, x8 - lsl x17, x1, #32 - sbcs x8, x2, x9 - sbcs x9, x12, x4 - add x17, x17, x1 - mov x1, v18.d[1] - lsr x2, x17, #32 - sbcs x7, x7, xzr - mov x12, v18.d[0] - sbcs x15, x15, xzr - sbc x3, x3, xzr - subs x4, x2, x17 - sbc x2, x17, xzr - adds x12, x13, x12 - adcs x16, x16, x1 - lsr x13, x2, #32 - extr x1, x2, x4, #32 - adc x2, x14, xzr - adds x4, x13, x17 - mul x13, x6, x6 - adc x14, xzr, xzr - subs x1, x8, x1 - sbcs x4, x9, x4 - mov x9, v28.d[0] - sbcs x7, x7, x14 - sbcs x8, x15, xzr - sbcs x3, x3, xzr - sbc x14, x17, xzr - adds x17, x9, x9 - adcs x12, x12, x12 - mov x15, v19.d[0] - adcs x9, x16, x16 - umulh x6, x6, x6 - adcs x16, x2, x2 - adc x2, xzr, xzr - adds x11, x11, x8 - adcs x3, x3, xzr - adcs x14, x14, xzr - adcs x8, xzr, xzr - adds x13, x1, x13 - mov x1, v19.d[1] - adcs x6, x4, x6 - mov x4, #0xffffffff // 
#4294967295 - adcs x15, x7, x15 - adcs x7, x11, x5 - adcs x1, x3, x1 - adcs x14, x14, x10 - adc x11, x8, xzr - adds x6, x6, x17 - adcs x8, x15, x12 - adcs x3, x7, x9 - adcs x15, x1, x16 - mov x16, #0xffffffff00000001 // #-4294967295 - adcs x14, x14, x2 - mov x2, #0x1 // #1 - adc x17, x11, xzr - cmn x13, x16 - adcs xzr, x6, x4 - adcs xzr, x8, x2 - adcs xzr, x3, xzr - adcs xzr, x15, xzr - adcs xzr, x14, xzr - adc x1, x17, xzr - neg x9, x1 - and x1, x16, x9 - adds x11, x13, x1 - and x13, x4, x9 - adcs x5, x6, x13 - and x1, x2, x9 - adcs x7, x8, x1 - stp x11, x5, [sp, #192] - adcs x11, x3, xzr - adcs x2, x15, xzr - stp x7, x11, [sp, #208] - adc x17, x14, xzr - stp x2, x17, [sp, #224] - ldp x5, x6, [sp, #240] - ldp x4, x3, [sp, #48] - subs x5, x5, x4 - sbcs x6, x6, x3 - ldp x7, x8, [sp, #256] - ldp x4, x3, [sp, #64] - sbcs x7, x7, x4 - sbcs x8, x8, x3 - ldp x9, x10, [sp, #272] - ldp x4, x3, [sp, #80] - sbcs x9, x9, x4 - sbcs x10, x10, x3 - csetm x3, cc // cc = lo, ul, last - mov x4, #0xffffffff // #4294967295 - and x4, x4, x3 - adds x5, x5, x4 - eor x4, x4, x3 - adcs x6, x6, x4 - mov x4, #0xfffffffffffffffe // #-2 - and x4, x4, x3 - adcs x7, x7, x4 - adcs x8, x8, x3 - adcs x9, x9, x3 - adc x10, x10, x3 - stp x5, x6, [x25, #96] - stp x7, x8, [x25, #112] - stp x9, x10, [x25, #128] - ldr q3, [sp, #288] - ldr q25, [sp, #96] - ldp x13, x23, [sp, #96] - ldp x3, x21, [sp, #288] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [sp, #128] - ldp x8, x24, [sp, #304] - subs x6, x3, x21 - ldr q0, [sp, #320] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [sp, #112] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [sp, #128] - add x2, x12, x7 
- adc x7, x5, x10 - ldp x5, x10, [sp, #320] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x26, x4, x16 - mov x4, v27.d[0] - sbcs x27, x20, x11 - sbcs x20, x9, x12 - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #256] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #272] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #256] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #272] - adds x20, x22, x26 - mul x10, x13, x14 - adcs x11, x11, x27 - eor x9, x8, x21 - adcs x26, x19, x17 - stp x20, x11, [sp, #240] - adcs x27, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #272] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // 
cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #240] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #272] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x26 - eor x1, x22, x9 - adcs x24, x23, x27 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x26 - adcs x15, x17, x27 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x14, x24, x11 - stp x22, x5, [sp, #240] - adcs x5, x13, x23 - adcs x12, x8, x23 - stp x14, x5, [sp, #256] - adc x19, x15, x23 - ldp x1, x2, [sp, #144] - ldp x3, x4, [sp, #160] - ldp x5, x6, [sp, #176] - lsl x0, x1, #2 - ldp x7, x8, [sp, #288] - subs x0, x0, x7 - extr x1, x2, x1, #62 - sbcs x1, x1, x8 - ldp x7, x8, [sp, #304] - extr x2, x3, x2, #62 - sbcs x2, x2, x7 - extr x3, x4, x3, #62 - sbcs x3, x3, x8 - extr x4, x5, x4, #62 - ldp x7, x8, [sp, #320] - sbcs x4, x4, x7 - extr x5, x6, x5, #62 - sbcs x5, x5, x8 - lsr x6, x6, #62 - adc x6, x6, xzr - lsl x7, x6, #32 - subs x8, x6, x7 - sbc x7, x7, xzr - adds x0, x0, x8 - adcs x1, 
x1, x7 - adcs x2, x2, x6 - adcs x3, x3, xzr - adcs x4, x4, xzr - adcs x5, x5, xzr - csetm x8, cc // cc = lo, ul, last - mov x9, #0xffffffff // #4294967295 - and x9, x9, x8 - adds x0, x0, x9 - eor x9, x9, x8 - adcs x1, x1, x9 - mov x9, #0xfffffffffffffffe // #-2 - and x9, x9, x8 - adcs x2, x2, x9 - adcs x3, x3, x8 - adcs x4, x4, x8 - adc x5, x5, x8 - stp x0, x1, [x25] - stp x2, x3, [x25, #16] - stp x4, x5, [x25, #32] - ldp x0, x1, [sp, #192] - mov x6, #0xffffffff // #4294967295 - subs x6, x6, x0 - mov x7, #0xffffffff00000000 // #-4294967296 - sbcs x7, x7, x1 - ldp x0, x1, [sp, #208] - mov x8, #0xfffffffffffffffe // #-2 - sbcs x8, x8, x0 - mov x13, #0xffffffffffffffff // #-1 - sbcs x9, x13, x1 - ldp x0, x1, [sp, #224] - sbcs x10, x13, x0 - sbc x11, x13, x1 - lsl x0, x6, #3 - extr x1, x7, x6, #61 - extr x2, x8, x7, #61 - extr x3, x9, x8, #61 - extr x4, x10, x9, #61 - extr x5, x11, x10, #61 - lsr x6, x11, #61 - add x6, x6, #0x1 - ldp x8, x9, [sp, #240] - ldp x10, x11, [sp, #256] - mov x14, #0x3 // #3 - mul x15, x14, x8 - umulh x8, x14, x8 - adds x0, x0, x15 - mul x15, x14, x9 - umulh x9, x14, x9 - adcs x1, x1, x15 - mul x15, x14, x10 - umulh x10, x14, x10 - adcs x2, x2, x15 - mul x15, x14, x11 - umulh x11, x14, x11 - adcs x3, x3, x15 - mul x15, x14, x12 - umulh x12, x14, x12 - adcs x4, x4, x15 - mul x15, x14, x19 - umulh x13, x14, x19 - adcs x5, x5, x15 - adc x6, x6, xzr - adds x1, x1, x8 - adcs x2, x2, x9 - adcs x3, x3, x10 - adcs x4, x4, x11 - adcs x5, x5, x12 - adcs x6, x6, x13 - lsl x7, x6, #32 - subs x8, x6, x7 - sbc x7, x7, xzr - adds x0, x0, x8 - adcs x1, x1, x7 - adcs x2, x2, x6 - adcs x3, x3, xzr - adcs x4, x4, xzr - adcs x5, x5, xzr - csetm x6, cc // cc = lo, ul, last - mov x7, #0xffffffff // #4294967295 - and x7, x7, x6 - adds x0, x0, x7 - eor x7, x7, x6 - adcs x1, x1, x7 - mov x7, #0xfffffffffffffffe // #-2 - and x7, x7, x6 - adcs x2, x2, x7 - adcs x3, x3, x6 - adcs x4, x4, x6 - adc x5, x5, x6 - stp x0, x1, [x25, #48] - stp x2, x3, [x25, #64] - stp x4, x5, [x25, #80] - ldp x19, x20, [sp, #336] - ldp x21, x22, [sp, #352] - ldp x23, x24, [sp, #368] - ldp x25, x26, [sp, #384] - ldp x27, xzr, [sp, #400] - add sp, sp, #0x1a0 - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits -#endif diff --git a/third_party/s2n-bignum/arm/p521/bignum_inv_p521.S b/third_party/s2n-bignum/arm/p521/bignum_inv_p521.S deleted file mode 100644 index 7db741647d6..00000000000 --- a/third_party/s2n-bignum/arm/p521/bignum_inv_p521.S +++ /dev/null @@ -1,1696 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Modular inverse modulo p_521 = 2^521 - 1 -// Input x[9]; output z[9] -// -// extern void bignum_inv_p521(uint64_t z[static 9],uint64_t x[static 9]); -// -// Assuming the 9-digit input x is coprime to p_521, i.e. is not divisible -// by it, returns z < p_521 such that x * z == 1 (mod p_521). Note that -// x does not need to be reduced modulo p_521, but the output always is. 
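Because p_521 = 2^521 - 1 is prime, the inverse exists for every x that p_521 does not divide, so the contract stated above can be cross-checked against a trivial reference. Below is a minimal Python sketch of that contract only, not of the divstep algorithm that follows; the helper names (limbs_to_int, int_to_limbs, inv_p521_ref) are illustrative, assuming the usual little-endian 64-bit limb layout.

    # Reference model for the bignum_inv_p521 contract (not its divstep implementation).
    P521 = (1 << 521) - 1

    def limbs_to_int(limbs):                 # 9 little-endian 64-bit limbs -> integer
        return sum(w << (64 * i) for i, w in enumerate(limbs))

    def int_to_limbs(n, k=9):                # integer -> k little-endian 64-bit limbs
        return [(n >> (64 * i)) & ((1 << 64) - 1) for i in range(k)]

    def inv_p521_ref(x_limbs):
        x = limbs_to_int(x_limbs)            # x need not be reduced mod p_521
        z = pow(x, P521 - 2, P521)           # Fermat: x^(p-2) == x^-1 (mod p) for prime p
        return int_to_limbs(z)               # output is fully reduced, z < p_521

    # Example: the inverse of 2 modulo p_521 is (p_521 + 1)/2 = 2^520.
    assert limbs_to_int(inv_p521_ref(int_to_limbs(2))) == 1 << 520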
-// -// Standard ARM ABI: X0 = z, X1 = x -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_inv_p521) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_inv_p521) - - .text - .balign 4 - -// Size in bytes of a 64-bit word - -#define N 8 - -// Used for the return pointer - -#define res x20 - -// Loop counter and d = 2 * delta value for divstep - -#define i x21 -#define d x22 - -// Registers used for matrix element magnitudes and signs - -#define m00 x10 -#define m01 x11 -#define m10 x12 -#define m11 x13 -#define s00 x14 -#define s01 x15 -#define s10 x16 -#define s11 x17 - -// Initial carries for combinations - -#define car0 x9 -#define car1 x19 - -// Input and output, plain registers treated according to pattern - -#define reg0 x0, #0 -#define reg1 x1, #0 -#define reg2 x2, #0 -#define reg3 x3, #0 -#define reg4 x4, #0 - -#define x x1, #0 -#define z x0, #0 - -// Pointer-offset pairs for temporaries on stack - -#define f sp, #0 -#define g sp, #(9*N) -#define u sp, #(18*N) -#define v sp, #(27*N) - -// Total size to reserve on the stack - -#define NSPACE #(36*N) - -// Very similar to a subroutine call to the s2n-bignum word_divstep59. -// But different in register usage and returning the final matrix in -// registers as follows -// -// [ m00 m01] -// [ m10 m11] - -#define divstep59() \ - and x4, x2, #0xfffff; \ - orr x4, x4, #0xfffffe0000000000; \ - and x5, x3, #0xfffff; \ - orr x5, x5, #0xc000000000000000; \ - tst x5, #0x1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - 
csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - asr x5, x5, #1; \ - add x8, x4, #0x100, lsl #12; \ - sbfx x8, x8, #21, #21; \ - mov x11, #0x100000; \ - add x11, x11, x11, lsl #21; \ - add x9, x4, x11; \ - asr x9, x9, #42; \ - add x10, x5, #0x100, lsl #12; \ - sbfx x10, x10, #21, #21; \ - add x11, x5, x11; \ - asr x11, x11, #42; \ - mul x6, x8, x2; \ - mul x7, x9, x3; \ - mul x2, x10, x2; \ - mul x3, x11, x3; \ - add x4, x6, x7; \ - add x5, x2, x3; \ - asr x2, x4, #20; \ - asr x3, x5, #20; \ - and x4, x2, #0xfffff; \ - orr x4, x4, #0xfffffe0000000000; \ - and x5, x3, #0xfffff; \ - orr x5, x5, #0xc000000000000000; \ - tst x5, #0x1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr 
x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - asr x5, x5, #1; \ - add x12, x4, #0x100, lsl #12; \ - sbfx x12, x12, #21, #21; \ - mov x15, #0x100000; \ - add x15, x15, x15, lsl #21; \ - add x13, x4, x15; \ - asr x13, x13, #42; \ - add x14, x5, #0x100, lsl #12; \ - sbfx x14, x14, #21, #21; \ - add x15, x5, x15; \ - asr x15, x15, #42; \ - mul x6, x12, x2; \ - mul x7, x13, x3; \ - mul x2, x14, x2; \ - mul x3, x15, x3; \ - add x4, x6, x7; \ - add x5, x2, x3; \ - asr x2, x4, #20; \ - asr x3, x5, #20; \ - and x4, x2, #0xfffff; \ - orr x4, x4, #0xfffffe0000000000; \ - and x5, x3, #0xfffff; \ - orr x5, x5, 
#0xc000000000000000; \ - tst x5, #0x1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - mul x2, x12, x8; \ - mul x3, x12, x9; \ - mul x6, x14, x8; \ - mul x7, x14, x9; \ - madd x8, x13, x10, x2; \ - madd x9, x13, x11, x3; \ - madd x16, x15, x10, x6; \ - madd x17, x15, x11, x7; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel 
x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - asr x5, x5, #1; \ - add x12, x4, #0x100, lsl #12; \ - sbfx x12, x12, #22, #21; \ - mov x15, #0x100000; \ - add x15, x15, x15, lsl #21; \ - add x13, x4, x15; \ - asr x13, x13, #43; \ - add x14, x5, #0x100, lsl #12; \ - sbfx x14, x14, #22, #21; \ - add x15, x5, x15; \ - asr x15, x15, #43; \ - mneg x2, x12, x8; \ - mneg x3, x12, x9; \ - mneg x4, x14, x8; \ - mneg x5, x14, x9; \ - msub m00, x13, x16, x2; \ - msub m01, x13, x17, x3; \ - msub m10, x15, x16, x4; \ - msub m11, x15, x17, x5 - -// Loading large constants - -#define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ - movk nn, n3, lsl #48 - -S2N_BN_SYMBOL(bignum_inv_p521): - -// Save registers and make room for temporaries - - stp x19, x20, [sp, -16]! - stp x21, x22, [sp, -16]! - sub sp, sp, NSPACE - -// Save the return pointer for the end so we can overwrite x0 later - - mov res, x0 - -// Copy the prime p_521 = 2^521 - 1 into the f variable - - mov x10, #0xFFFFFFFFFFFFFFFF - stp x10, x10, [f] - stp x10, x10, [f+16] - stp x10, x10, [f+32] - stp x10, x10, [f+48] - mov x11, #0x1FF - str x11, [f+64] - -// Copy the input into the g variable, but reduce it strictly mod p_521 -// so that g <= f as assumed in the bound proof. This code fragment is -// very similar to bignum_mod_p521_9 complete with carry condensation. - - ldr x8, [x1, #64] - lsr x9, x8, #9 - - subs xzr, xzr, xzr - ldp x10, x11, [x1] - adcs xzr, x10, x9 - adcs xzr, x11, xzr - ldp x12, x13, [x1, #16] - and x7, x12, x13 - adcs xzr, x7, xzr - ldp x14, x15, [x1, #32] - and x7, x14, x15 - adcs xzr, x7, xzr - ldp x16, x17, [x1, #48] - and x7, x16, x17 - adcs xzr, x7, xzr - orr x7, x8, #~0x1FF - adcs x7, x7, xzr - - adcs x10, x10, x9 - adcs x11, x11, xzr - adcs x12, x12, xzr - adcs x13, x13, xzr - adcs x14, x14, xzr - adcs x15, x15, xzr - adcs x16, x16, xzr - adcs x17, x17, xzr - adc x8, x8, xzr - and x8, x8, #0x1FF - - stp x10, x11, [g] - stp x12, x13, [g+16] - stp x14, x15, [g+32] - stp x16, x17, [g+48] - str x8, [g+64] - -// Also maintain weakly reduced < 2*p_521 vector [u,v] such that -// [f,g] == x * 2^{1239-59*i} * [u,v] (mod p_521) -// starting with [p_521,x] == x * 2^{1239-59*0} * [0,2^-1239] (mod p_521) -// Note that because (2^{a+521} == 2^a) (mod p_521) we simply have -// (2^-1239 == 2^324) (mod p_521) so the constant initializer is simple. -// -// Based on the standard divstep bound, for inputs <= 2^b we need at least -// n >= (9437 * b + 1) / 4096. Since b is 521, that means 1201 iterations. -// Since we package divstep in multiples of 59 bits, we do 21 blocks of 59 -// making *1239* total. (With a bit more effort we could avoid the full 59 -// divsteps and use a shorter tail computation, but we keep it simple.) -// Hence, after the 21st iteration we have [f,g] == x * [u,v] and since -// |f| = 1 we get the modular inverse from u by flipping its sign with f. 
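The bookkeeping in the comment above is easy to sanity-check independently of the assembly. A short Python sketch (pure integer arithmetic, no s2n-bignum types assumed) confirms that 2^324 really is 2^-1239 mod p_521, that 21 blocks of 59 divsteps cover the standard bound, and that storing 16 in 64-bit limb 5 of v encodes exactly 2^324.

    # Sanity checks for the divstep bookkeeping (integer model only).
    P521 = (1 << 521) - 1

    # 2^521 == 1 (mod p_521), hence 2^-1239 == 2^(3*521 - 1239) == 2^324.
    assert (pow(2, 1239, P521) * pow(2, 324, P521)) % P521 == 1

    # Divstep bound for b = 521-bit inputs: n >= ceil((9437*b + 1)/4096) = 1201,
    # and 21 blocks of 59 divsteps give 1239 >= 1201.
    b = 521
    n_min = -(-(9437 * b + 1) // 4096)       # ceiling division
    assert n_min == 1201 and 21 * 59 >= n_min

    # v is initialized with 16 in limb 5 of nine 64-bit limbs: 16 * 2^(64*5) == 2^324.
    assert 16 << (64 * 5) == 1 << 324

This matches the mov x10, #16 / stp xzr, x10, [v+32] initialization of v that follows.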
- - stp xzr, xzr, [u] - stp xzr, xzr, [u+16] - stp xzr, xzr, [u+32] - stp xzr, xzr, [u+48] - str xzr, [u+64] - - mov x10, #16 - stp xzr, xzr, [v] - stp xzr, xzr, [v+16] - stp xzr, x10, [v+32] - stp xzr, xzr, [v+48] - str xzr, [v+64] - -// Start of main loop. We jump into the middle so that the divstep -// portion is common to the special 21st iteration after a uniform -// first 20. - - mov i, #21 - mov d, #1 - b bignum_inv_p521_midloop - -bignum_inv_p521_loop: - -// Separate the matrix elements into sign-magnitude pairs - - cmp m00, xzr - csetm s00, mi - cneg m00, m00, mi - - cmp m01, xzr - csetm s01, mi - cneg m01, m01, mi - - cmp m10, xzr - csetm s10, mi - cneg m10, m10, mi - - cmp m11, xzr - csetm s11, mi - cneg m11, m11, mi - -// Adjust the initial values to allow for complement instead of negation -// This initial offset is the same for [f,g] and [u,v] compositions. -// Save it in stable registers for the [u,v] part and do [f,g] first. - - and x0, m00, s00 - and x1, m01, s01 - add car0, x0, x1 - - and x0, m10, s10 - and x1, m11, s11 - add car1, x0, x1 - -// Now the computation of the updated f and g values. This maintains a -// 2-word carry between stages so we can conveniently insert the shift -// right by 59 before storing back, and not overwrite digits we need -// again of the old f and g values. -// -// Digit 0 of [f,g] - - ldr x7, [f] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, car0, x0 - adc x2, xzr, x1 - ldr x8, [g] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - adc x2, x2, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, car1, x0 - adc x3, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - adc x3, x3, x1 - -// Digit 1 of [f,g] - - ldr x7, [f+N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [g+N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - adc x6, x6, x1 - extr x4, x2, x4, #59 - str x4, [f] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x3, x3, x0 - adc x4, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x3, x3, x0 - adc x4, x4, x1 - extr x5, x3, x5, #59 - str x5, [g] - -// Digit 2 of [f,g] - - ldr x7, [f+2*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [g+2*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - adc x5, x5, x1 - extr x2, x6, x2, #59 - str x2, [f+N] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x4, x4, x0 - adc x2, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x4, x4, x0 - adc x2, x2, x1 - extr x3, x4, x3, #59 - str x3, [g+N] - -// Digit 3 of [f,g] - - ldr x7, [f+3*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, xzr, x1 - ldr x8, [g+3*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - adc x3, x3, x1 - extr x6, x5, x6, #59 - str x6, [f+2*N] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x2, x2, x0 - adc x6, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x2, x2, x0 - adc x6, x6, x1 - extr x4, x2, x4, #59 - str x4, [g+2*N] - -// Digit 4 of [f,g] - - ldr x7, [f+4*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x3, x3, x0 - adc x4, xzr, x1 - ldr x8, [g+4*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x3, x3, x0 - adc x4, x4, x1 - extr x5, x3, x5, #59 - str x5, [f+3*N] 
- - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x6, x6, x0 - adc x5, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x6, x6, x0 - adc x5, x5, x1 - extr x2, x6, x2, #59 - str x2, [g+3*N] - -// Digit 5 of [f,g] - - ldr x7, [f+5*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, x4, x0 - adc x2, xzr, x1 - ldr x8, [g+5*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - adc x2, x2, x1 - extr x3, x4, x3, #59 - str x3, [f+4*N] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, x5, x0 - adc x3, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - adc x3, x3, x1 - extr x6, x5, x6, #59 - str x6, [g+4*N] - -// Digit 6 of [f,g] - - ldr x7, [f+6*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [g+6*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - adc x6, x6, x1 - extr x4, x2, x4, #59 - str x4, [f+5*N] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x3, x3, x0 - adc x4, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x3, x3, x0 - adc x4, x4, x1 - extr x5, x3, x5, #59 - str x5, [g+5*N] - -// Digit 7 of [f,g] - - ldr x7, [f+7*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [g+7*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - adc x5, x5, x1 - extr x2, x6, x2, #59 - str x2, [f+6*N] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x4, x4, x0 - adc x2, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x4, x4, x0 - adc x2, x2, x1 - extr x3, x4, x3, #59 - str x3, [g+6*N] - -// Digits 8 and 9 of [f,g] - - ldr x7, [f+8*N] - eor x1, x7, s00 - asr x3, x1, #63 - and x3, x3, m00 - neg x3, x3 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, x3, x1 - ldr x8, [g+8*N] - eor x1, x8, s01 - asr x0, x1, #63 - and x0, x0, m01 - sub x3, x3, x0 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - adc x3, x3, x1 - extr x6, x5, x6, #59 - str x6, [f+7*N] - extr x5, x3, x5, #59 - str x5, [f+8*N] - - eor x1, x7, s10 - asr x5, x1, #63 - and x5, x5, m10 - neg x5, x5 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x2, x2, x0 - adc x5, x5, x1 - eor x1, x8, s11 - asr x0, x1, #63 - and x0, x0, m11 - sub x5, x5, x0 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x2, x2, x0 - adc x5, x5, x1 - extr x4, x2, x4, #59 - str x4, [g+7*N] - extr x2, x5, x2, #59 - str x2, [g+8*N] - -// Now the computation of the updated u and v values and their -// modular reductions. A very similar accumulation except that -// the top words of u and v are unsigned and we don't shift. 
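Concretely, each iteration applies the same 2x2 matrix to (u,v) as to (f,g), but without the right shift by 59: the missing 2^-59 factor is absorbed by the exponent 1239-59*i in the invariant quoted earlier. The following is a hedged integer-level model of one block update; the names update_fg and update_uv are illustrative, and the real code works digit by digit in sign-magnitude form and only reduces u,v weakly to < 2*p_521.

    # Integer-level model of one 59-divstep block update (not the real word-level code).
    P521 = (1 << 521) - 1

    def update_fg(M, f, g):
        m00, m01, m10, m11 = M
        # The divstep matrix is built so both combinations are multiples of 2^59.
        return (m00 * f + m01 * g) >> 59, (m10 * f + m11 * g) >> 59

    def update_uv(M, u, v):
        m00, m01, m10, m11 = M
        # Same matrix, no shift; this model reduces fully, the assembly only weakly.
        return (m00 * u + m01 * v) % P521, (m10 * u + m11 * v) % P521

Because (f,g) picks up a 2^-59 factor that (u,v) does not, the relation [f,g] == x * 2^{1239-59*i} * [u,v] (mod p_521) is preserved as i advances.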
-// -// Digit 0 of [u,v] - - ldr x7, [u] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, car0, x0 - adc x2, xzr, x1 - ldr x8, [v] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u] - adc x2, x2, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, car1, x0 - adc x3, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - str x5, [v] - adc x3, x3, x1 - -// Digit 1 of [u,v] - - ldr x7, [u+N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [v+N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - str x2, [u+N] - adc x6, x6, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x3, x3, x0 - adc x4, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x3, x3, x0 - str x3, [v+N] - adc x4, x4, x1 - -// Digit 2 of [u,v] - - ldr x7, [u+2*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [v+2*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - str x6, [u+2*N] - adc x5, x5, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x4, x4, x0 - adc x2, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x4, x4, x0 - str x4, [v+2*N] - adc x2, x2, x1 - -// Digit 3 of [u,v] - - ldr x7, [u+3*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, xzr, x1 - ldr x8, [v+3*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - str x5, [u+3*N] - adc x3, x3, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x2, x2, x0 - adc x6, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x2, x2, x0 - str x2, [v+3*N] - adc x6, x6, x1 - -// Digit 4 of [u,v] - - ldr x7, [u+4*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x3, x3, x0 - adc x4, xzr, x1 - ldr x8, [v+4*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x3, x3, x0 - str x3, [u+4*N] - adc x4, x4, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x6, x6, x0 - adc x5, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x6, x6, x0 - str x6, [v+4*N] - adc x5, x5, x1 - -// Digit 5 of [u,v] - - ldr x7, [u+5*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, x4, x0 - adc x2, xzr, x1 - ldr x8, [v+5*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u+5*N] - adc x2, x2, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, x5, x0 - adc x3, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - str x5, [v+5*N] - adc x3, x3, x1 - -// Digit 6 of [u,v] - - ldr x7, [u+6*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [v+6*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - str x2, [u+6*N] - adc x6, x6, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x3, x3, x0 - adc x4, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x3, x3, x0 - str x3, [v+6*N] - adc x4, x4, x1 - -// Digit 7 of [u,v] - - ldr x7, [u+7*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [v+7*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - str x6, [u+7*N] - adc x5, x5, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, 
m10 - adds x4, x4, x0 - adc x2, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x4, x4, x0 - str x4, [v+7*N] - adc x2, x2, x1 - -// Digits 8 and 9 of u (top is unsigned) - - ldr x7, [u+8*N] - eor x1, x7, s00 - and x3, s00, m00 - neg x3, x3 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, x3, x1 - ldr x8, [v+8*N] - eor x1, x8, s01 - and x0, s01, m01 - sub x3, x3, x0 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - adc x3, x3, x1 - -// Modular reduction of u, reloading as needed from u[0],...,u[7],x5,x3 - - extr x6, x3, x5, #9 - ldp x0, x1, [u] - add x6, x6, x3, asr #63 - sub x5, x5, x6, lsl #9 - adds x0, x0, x6 - asr x6, x6, #63 - adcs x1, x1, x6 - stp x0, x1, [u] - ldp x0, x1, [u+16] - adcs x0, x0, x6 - adcs x1, x1, x6 - stp x0, x1, [u+16] - ldp x0, x1, [u+32] - adcs x0, x0, x6 - adcs x1, x1, x6 - stp x0, x1, [u+32] - ldp x0, x1, [u+48] - adcs x0, x0, x6 - adcs x1, x1, x6 - stp x0, x1, [u+48] - adc x5, x5, x6 - str x5, [u+64] - -// Digits 8 and 9 of v (top is unsigned) - - eor x1, x7, s10 - and x5, s10, m10 - neg x5, x5 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x2, x2, x0 - adc x5, x5, x1 - eor x1, x8, s11 - and x0, s11, m11 - sub x5, x5, x0 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x2, x2, x0 - adc x5, x5, x1 - -// Modular reduction of v, reloading as needed from v[0],...,v[7],x2,x5 - - extr x6, x5, x2, #9 - ldp x0, x1, [v] - add x6, x6, x5, asr #63 - sub x2, x2, x6, lsl #9 - adds x0, x0, x6 - asr x6, x6, #63 - adcs x1, x1, x6 - stp x0, x1, [v] - ldp x0, x1, [v+16] - adcs x0, x0, x6 - adcs x1, x1, x6 - stp x0, x1, [v+16] - ldp x0, x1, [v+32] - adcs x0, x0, x6 - adcs x1, x1, x6 - stp x0, x1, [v+32] - ldp x0, x1, [v+48] - adcs x0, x0, x6 - adcs x1, x1, x6 - stp x0, x1, [v+48] - adc x2, x2, x6 - str x2, [v+64] - -bignum_inv_p521_midloop: - - mov x1, d - ldr x2, [f] - ldr x3, [g] - divstep59() - mov d, x1 - -// Next iteration - - subs i, i, #1 - bne bignum_inv_p521_loop - -// The 21st and last iteration does not need anything except the -// u value and the sign of f; the latter can be obtained from the -// lowest word of f. So it's done differently from the main loop. -// Find the sign of the new f. For this we just need one digit -// since we know (for in-scope cases) that f is either +1 or -1. -// We don't explicitly shift right by 59 either, but looking at -// bit 63 (or any bit >= 60) of the unshifted result is enough -// to distinguish -1 from +1; this is then made into a mask. - - ldr x0, [f] - ldr x1, [g] - mul x0, x0, m00 - madd x1, x1, m01, x0 - asr x0, x1, #63 - -// Now separate out the matrix into sign-magnitude pairs -// and adjust each one based on the sign of f. -// -// Note that at this point we expect |f|=1 and we got its -// sign above, so then since [f,0] == x * [u,v] (mod p_521) -// we want to flip the sign of u according to that of f. 
- - cmp m00, xzr - csetm s00, mi - cneg m00, m00, mi - eor s00, s00, x0 - - cmp m01, xzr - csetm s01, mi - cneg m01, m01, mi - eor s01, s01, x0 - - cmp m10, xzr - csetm s10, mi - cneg m10, m10, mi - eor s10, s10, x0 - - cmp m11, xzr - csetm s11, mi - cneg m11, m11, mi - eor s11, s11, x0 - -// Adjust the initial value to allow for complement instead of negation - - and x0, m00, s00 - and x1, m01, s01 - add car0, x0, x1 - -// Digit 0 of [u] - - ldr x7, [u] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, car0, x0 - adc x2, xzr, x1 - ldr x8, [v] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u] - adc x2, x2, x1 - -// Digit 1 of [u] - - ldr x7, [u+N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [v+N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - str x2, [u+N] - adc x6, x6, x1 - -// Digit 2 of [u] - - ldr x7, [u+2*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [v+2*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - str x6, [u+2*N] - adc x5, x5, x1 - -// Digit 3 of [u] - - ldr x7, [u+3*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, xzr, x1 - ldr x8, [v+3*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - str x5, [u+3*N] - adc x3, x3, x1 - -// Digit 4 of [u] - - ldr x7, [u+4*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x3, x3, x0 - adc x4, xzr, x1 - ldr x8, [v+4*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x3, x3, x0 - str x3, [u+4*N] - adc x4, x4, x1 - -// Digit 5 of [u] - - ldr x7, [u+5*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, x4, x0 - adc x2, xzr, x1 - ldr x8, [v+5*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u+5*N] - adc x2, x2, x1 - -// Digit 6 of [u] - - ldr x7, [u+6*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [v+6*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - str x2, [u+6*N] - adc x6, x6, x1 - -// Digit 7 of [u] - - ldr x7, [u+7*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [v+7*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - str x6, [u+7*N] - adc x5, x5, x1 - -// Digits 8 and 9 of u (top is unsigned) - - ldr x7, [u+8*N] - eor x1, x7, s00 - and x3, s00, m00 - neg x3, x3 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, x3, x1 - ldr x8, [v+8*N] - eor x1, x8, s01 - and x0, s01, m01 - sub x3, x3, x0 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - adc x3, x3, x1 - -// Modular reduction of u, reloading as needed from u[0],...,u[7],x5,x3 - - extr x6, x3, x5, #9 - ldp x10, x11, [u] - add x6, x6, x3, asr #63 - sub x5, x5, x6, lsl #9 - adds x10, x10, x6 - asr x6, x6, #63 - adcs x11, x11, x6 - ldp x12, x13, [u+16] - adcs x12, x12, x6 - adcs x13, x13, x6 - ldp x14, x15, [u+32] - adcs x14, x14, x6 - adcs x15, x15, x6 - ldp x16, x17, [u+48] - adcs x16, x16, x6 - adcs x17, x17, x6 - adc x19, x5, x6 - -// Further strict reduction ready for the output, which just means -// a conditional subtraction of p_521 - - subs x0, x10, #-1 - adcs x1, x11, xzr - adcs x2, x12, xzr - adcs x3, x13, xzr - adcs x4, x14, xzr - adcs x5, x15, xzr - adcs x6, x16, xzr - adcs x7, x17, xzr - mov x8, #0x1FF - sbcs x8, 
x19, x8 - - csel x0, x0, x10, cs - csel x1, x1, x11, cs - csel x2, x2, x12, cs - csel x3, x3, x13, cs - csel x4, x4, x14, cs - csel x5, x5, x15, cs - csel x6, x6, x16, cs - csel x7, x7, x17, cs - csel x8, x8, x19, cs - -// Store it back to the final output - - stp x0, x1, [res] - stp x2, x3, [res, #16] - stp x4, x5, [res, #32] - stp x6, x7, [res, #48] - str x8, [res, #64] - -// Restore stack and registers - - add sp, sp, NSPACE - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits -#endif diff --git a/third_party/s2n-bignum/arm/p521/p521_jadd_alt.S b/third_party/s2n-bignum/arm/p521/p521_jadd_alt.S deleted file mode 100644 index 72c9239be29..00000000000 --- a/third_party/s2n-bignum/arm/p521/p521_jadd_alt.S +++ /dev/null @@ -1,979 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Point addition on NIST curve P-521 in Jacobian coordinates -// -// extern void p521_jadd_alt -// (uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 27]); -// -// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. -// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). -// It is assumed that all coordinates of the input points p1 and p2 are -// fully reduced mod p_521, that both z coordinates are nonzero and -// that neither p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents -// the same affine point as". -// -// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jadd_alt) - S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jadd_alt) - - .text - .balign 4 - -// Size of individual field elements - -#define NUMSIZE 72 - -// Stable homes for input arguments during main code sequence - -#define input_z x26 -#define input_x x27 -#define input_y x28 - -// Pointer-offset pairs for inputs and outputs - -#define x_1 input_x, #0 -#define y_1 input_x, #NUMSIZE -#define z_1 input_x, #(2*NUMSIZE) - -#define x_2 input_y, #0 -#define y_2 input_y, #NUMSIZE -#define z_2 input_y, #(2*NUMSIZE) - -#define x_3 input_z, #0 -#define y_3 input_z, #NUMSIZE -#define z_3 input_z, #(2*NUMSIZE) - -// Pointer-offset pairs for temporaries, with some aliasing -// NSPACE is the total stack needed for these temporaries - -#define z1sq sp, #(NUMSIZE*0) -#define ww sp, #(NUMSIZE*0) -#define resx sp, #(NUMSIZE*0) - -#define yd sp, #(NUMSIZE*1) -#define y2a sp, #(NUMSIZE*1) - -#define x2a sp, #(NUMSIZE*2) -#define zzx2 sp, #(NUMSIZE*2) - -#define zz sp, #(NUMSIZE*3) -#define t1 sp, #(NUMSIZE*3) - -#define t2 sp, #(NUMSIZE*4) -#define x1a sp, #(NUMSIZE*4) -#define zzx1 sp, #(NUMSIZE*4) -#define resy sp, #(NUMSIZE*4) - -#define xd sp, #(NUMSIZE*5) -#define z2sq sp, #(NUMSIZE*5) -#define resz sp, #(NUMSIZE*5) - -#define y1a sp, #(NUMSIZE*6) - -// NUMSIZE*7 is not 16-aligned so we round it up - -#define NSPACE (NUMSIZE*7+8) - -// Corresponds exactly to bignum_mul_p521_alt - -#define mul_p521(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x15, x3, x5; \ - umulh x16, x3, x5; \ - mul x14, x3, x6; \ - umulh x17, x3, x6; \ - adds x16, x16, x14; \ - ldp x7, x8, [P2+16]; \ - mul x14, x3, x7; \ - umulh x19, x3, x7; \ - adcs x17, x17, x14; \ - mul x14, x3, x8; \ - umulh x20, x3, x8; \ - adcs x19, x19, x14; \ - ldp 
x9, x10, [P2+32]; \ - mul x14, x3, x9; \ - umulh x21, x3, x9; \ - adcs x20, x20, x14; \ - mul x14, x3, x10; \ - umulh x22, x3, x10; \ - adcs x21, x21, x14; \ - ldp x11, x12, [P2+48]; \ - mul x14, x3, x11; \ - umulh x23, x3, x11; \ - adcs x22, x22, x14; \ - ldr x13, [P2+64]; \ - mul x14, x3, x12; \ - umulh x24, x3, x12; \ - adcs x23, x23, x14; \ - mul x14, x3, x13; \ - umulh x1, x3, x13; \ - adcs x24, x24, x14; \ - adc x1, x1, xzr; \ - mul x14, x4, x5; \ - adds x16, x16, x14; \ - mul x14, x4, x6; \ - adcs x17, x17, x14; \ - mul x14, x4, x7; \ - adcs x19, x19, x14; \ - mul x14, x4, x8; \ - adcs x20, x20, x14; \ - mul x14, x4, x9; \ - adcs x21, x21, x14; \ - mul x14, x4, x10; \ - adcs x22, x22, x14; \ - mul x14, x4, x11; \ - adcs x23, x23, x14; \ - mul x14, x4, x12; \ - adcs x24, x24, x14; \ - mul x14, x4, x13; \ - adcs x1, x1, x14; \ - cset x0, hs; \ - umulh x14, x4, x5; \ - adds x17, x17, x14; \ - umulh x14, x4, x6; \ - adcs x19, x19, x14; \ - umulh x14, x4, x7; \ - adcs x20, x20, x14; \ - umulh x14, x4, x8; \ - adcs x21, x21, x14; \ - umulh x14, x4, x9; \ - adcs x22, x22, x14; \ - umulh x14, x4, x10; \ - adcs x23, x23, x14; \ - umulh x14, x4, x11; \ - adcs x24, x24, x14; \ - umulh x14, x4, x12; \ - adcs x1, x1, x14; \ - umulh x14, x4, x13; \ - adc x0, x0, x14; \ - stp x15, x16, [P0]; \ - ldp x3, x4, [P1+16]; \ - mul x14, x3, x5; \ - adds x17, x17, x14; \ - mul x14, x3, x6; \ - adcs x19, x19, x14; \ - mul x14, x3, x7; \ - adcs x20, x20, x14; \ - mul x14, x3, x8; \ - adcs x21, x21, x14; \ - mul x14, x3, x9; \ - adcs x22, x22, x14; \ - mul x14, x3, x10; \ - adcs x23, x23, x14; \ - mul x14, x3, x11; \ - adcs x24, x24, x14; \ - mul x14, x3, x12; \ - adcs x1, x1, x14; \ - mul x14, x3, x13; \ - adcs x0, x0, x14; \ - cset x15, hs; \ - umulh x14, x3, x5; \ - adds x19, x19, x14; \ - umulh x14, x3, x6; \ - adcs x20, x20, x14; \ - umulh x14, x3, x7; \ - adcs x21, x21, x14; \ - umulh x14, x3, x8; \ - adcs x22, x22, x14; \ - umulh x14, x3, x9; \ - adcs x23, x23, x14; \ - umulh x14, x3, x10; \ - adcs x24, x24, x14; \ - umulh x14, x3, x11; \ - adcs x1, x1, x14; \ - umulh x14, x3, x12; \ - adcs x0, x0, x14; \ - umulh x14, x3, x13; \ - adc x15, x15, x14; \ - mul x14, x4, x5; \ - adds x19, x19, x14; \ - mul x14, x4, x6; \ - adcs x20, x20, x14; \ - mul x14, x4, x7; \ - adcs x21, x21, x14; \ - mul x14, x4, x8; \ - adcs x22, x22, x14; \ - mul x14, x4, x9; \ - adcs x23, x23, x14; \ - mul x14, x4, x10; \ - adcs x24, x24, x14; \ - mul x14, x4, x11; \ - adcs x1, x1, x14; \ - mul x14, x4, x12; \ - adcs x0, x0, x14; \ - mul x14, x4, x13; \ - adcs x15, x15, x14; \ - cset x16, hs; \ - umulh x14, x4, x5; \ - adds x20, x20, x14; \ - umulh x14, x4, x6; \ - adcs x21, x21, x14; \ - umulh x14, x4, x7; \ - adcs x22, x22, x14; \ - umulh x14, x4, x8; \ - adcs x23, x23, x14; \ - umulh x14, x4, x9; \ - adcs x24, x24, x14; \ - umulh x14, x4, x10; \ - adcs x1, x1, x14; \ - umulh x14, x4, x11; \ - adcs x0, x0, x14; \ - umulh x14, x4, x12; \ - adcs x15, x15, x14; \ - umulh x14, x4, x13; \ - adc x16, x16, x14; \ - stp x17, x19, [P0+16]; \ - ldp x3, x4, [P1+32]; \ - mul x14, x3, x5; \ - adds x20, x20, x14; \ - mul x14, x3, x6; \ - adcs x21, x21, x14; \ - mul x14, x3, x7; \ - adcs x22, x22, x14; \ - mul x14, x3, x8; \ - adcs x23, x23, x14; \ - mul x14, x3, x9; \ - adcs x24, x24, x14; \ - mul x14, x3, x10; \ - adcs x1, x1, x14; \ - mul x14, x3, x11; \ - adcs x0, x0, x14; \ - mul x14, x3, x12; \ - adcs x15, x15, x14; \ - mul x14, x3, x13; \ - adcs x16, x16, x14; \ - cset x17, hs; \ - umulh x14, x3, x5; \ - adds x21, x21, x14; \ - umulh 
x14, x3, x6; \ - adcs x22, x22, x14; \ - umulh x14, x3, x7; \ - adcs x23, x23, x14; \ - umulh x14, x3, x8; \ - adcs x24, x24, x14; \ - umulh x14, x3, x9; \ - adcs x1, x1, x14; \ - umulh x14, x3, x10; \ - adcs x0, x0, x14; \ - umulh x14, x3, x11; \ - adcs x15, x15, x14; \ - umulh x14, x3, x12; \ - adcs x16, x16, x14; \ - umulh x14, x3, x13; \ - adc x17, x17, x14; \ - mul x14, x4, x5; \ - adds x21, x21, x14; \ - mul x14, x4, x6; \ - adcs x22, x22, x14; \ - mul x14, x4, x7; \ - adcs x23, x23, x14; \ - mul x14, x4, x8; \ - adcs x24, x24, x14; \ - mul x14, x4, x9; \ - adcs x1, x1, x14; \ - mul x14, x4, x10; \ - adcs x0, x0, x14; \ - mul x14, x4, x11; \ - adcs x15, x15, x14; \ - mul x14, x4, x12; \ - adcs x16, x16, x14; \ - mul x14, x4, x13; \ - adcs x17, x17, x14; \ - cset x19, hs; \ - umulh x14, x4, x5; \ - adds x22, x22, x14; \ - umulh x14, x4, x6; \ - adcs x23, x23, x14; \ - umulh x14, x4, x7; \ - adcs x24, x24, x14; \ - umulh x14, x4, x8; \ - adcs x1, x1, x14; \ - umulh x14, x4, x9; \ - adcs x0, x0, x14; \ - umulh x14, x4, x10; \ - adcs x15, x15, x14; \ - umulh x14, x4, x11; \ - adcs x16, x16, x14; \ - umulh x14, x4, x12; \ - adcs x17, x17, x14; \ - umulh x14, x4, x13; \ - adc x19, x19, x14; \ - stp x20, x21, [P0+32]; \ - ldp x3, x4, [P1+48]; \ - mul x14, x3, x5; \ - adds x22, x22, x14; \ - mul x14, x3, x6; \ - adcs x23, x23, x14; \ - mul x14, x3, x7; \ - adcs x24, x24, x14; \ - mul x14, x3, x8; \ - adcs x1, x1, x14; \ - mul x14, x3, x9; \ - adcs x0, x0, x14; \ - mul x14, x3, x10; \ - adcs x15, x15, x14; \ - mul x14, x3, x11; \ - adcs x16, x16, x14; \ - mul x14, x3, x12; \ - adcs x17, x17, x14; \ - mul x14, x3, x13; \ - adcs x19, x19, x14; \ - cset x20, hs; \ - umulh x14, x3, x5; \ - adds x23, x23, x14; \ - umulh x14, x3, x6; \ - adcs x24, x24, x14; \ - umulh x14, x3, x7; \ - adcs x1, x1, x14; \ - umulh x14, x3, x8; \ - adcs x0, x0, x14; \ - umulh x14, x3, x9; \ - adcs x15, x15, x14; \ - umulh x14, x3, x10; \ - adcs x16, x16, x14; \ - umulh x14, x3, x11; \ - adcs x17, x17, x14; \ - umulh x14, x3, x12; \ - adcs x19, x19, x14; \ - umulh x14, x3, x13; \ - adc x20, x20, x14; \ - mul x14, x4, x5; \ - adds x23, x23, x14; \ - mul x14, x4, x6; \ - adcs x24, x24, x14; \ - mul x14, x4, x7; \ - adcs x1, x1, x14; \ - mul x14, x4, x8; \ - adcs x0, x0, x14; \ - mul x14, x4, x9; \ - adcs x15, x15, x14; \ - mul x14, x4, x10; \ - adcs x16, x16, x14; \ - mul x14, x4, x11; \ - adcs x17, x17, x14; \ - mul x14, x4, x12; \ - adcs x19, x19, x14; \ - mul x14, x4, x13; \ - adcs x20, x20, x14; \ - cset x21, hs; \ - umulh x14, x4, x5; \ - adds x24, x24, x14; \ - umulh x14, x4, x6; \ - adcs x1, x1, x14; \ - umulh x14, x4, x7; \ - adcs x0, x0, x14; \ - umulh x14, x4, x8; \ - adcs x15, x15, x14; \ - umulh x14, x4, x9; \ - adcs x16, x16, x14; \ - umulh x14, x4, x10; \ - adcs x17, x17, x14; \ - umulh x14, x4, x11; \ - adcs x19, x19, x14; \ - umulh x14, x4, x12; \ - adcs x20, x20, x14; \ - umulh x14, x4, x13; \ - adc x21, x21, x14; \ - stp x22, x23, [P0+48]; \ - ldr x3, [P1+64]; \ - mul x14, x3, x5; \ - adds x24, x24, x14; \ - mul x14, x3, x6; \ - adcs x1, x1, x14; \ - mul x14, x3, x7; \ - adcs x0, x0, x14; \ - mul x14, x3, x8; \ - adcs x15, x15, x14; \ - mul x14, x3, x9; \ - adcs x16, x16, x14; \ - mul x14, x3, x10; \ - adcs x17, x17, x14; \ - mul x14, x3, x11; \ - adcs x19, x19, x14; \ - mul x14, x3, x12; \ - adcs x20, x20, x14; \ - mul x14, x3, x13; \ - adc x21, x21, x14; \ - umulh x14, x3, x5; \ - adds x1, x1, x14; \ - umulh x14, x3, x6; \ - adcs x0, x0, x14; \ - umulh x14, x3, x7; \ - adcs x15, x15, x14; \ - umulh 
x14, x3, x8; \ - adcs x16, x16, x14; \ - umulh x14, x3, x9; \ - adcs x17, x17, x14; \ - umulh x14, x3, x10; \ - adcs x19, x19, x14; \ - umulh x14, x3, x11; \ - adcs x20, x20, x14; \ - umulh x14, x3, x12; \ - adc x21, x21, x14; \ - cmp xzr, xzr; \ - ldp x5, x6, [P0]; \ - extr x14, x1, x24, #9; \ - adcs x5, x5, x14; \ - extr x14, x0, x1, #9; \ - adcs x6, x6, x14; \ - ldp x7, x8, [P0+16]; \ - extr x14, x15, x0, #9; \ - adcs x7, x7, x14; \ - extr x14, x16, x15, #9; \ - adcs x8, x8, x14; \ - ldp x9, x10, [P0+32]; \ - extr x14, x17, x16, #9; \ - adcs x9, x9, x14; \ - extr x14, x19, x17, #9; \ - adcs x10, x10, x14; \ - ldp x11, x12, [P0+48]; \ - extr x14, x20, x19, #9; \ - adcs x11, x11, x14; \ - extr x14, x21, x20, #9; \ - adcs x12, x12, x14; \ - orr x13, x24, #0xfffffffffffffe00; \ - lsr x14, x21, #9; \ - adcs x13, x13, x14; \ - sbcs x5, x5, xzr; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - and x13, x13, #0x1ff; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] - -// Corresponds exactly to bignum_sqr_p521_alt - -#define sqr_p521(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x11, x2, x3; \ - umulh x12, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x10, x2, x4; \ - umulh x13, x2, x4; \ - adds x12, x12, x10; \ - ldp x6, x7, [P1+32]; \ - mul x10, x2, x5; \ - umulh x14, x2, x5; \ - adcs x13, x13, x10; \ - ldp x8, x9, [P1+48]; \ - mul x10, x2, x6; \ - umulh x15, x2, x6; \ - adcs x14, x14, x10; \ - mul x10, x2, x7; \ - umulh x16, x2, x7; \ - adcs x15, x15, x10; \ - mul x10, x2, x8; \ - umulh x17, x2, x8; \ - adcs x16, x16, x10; \ - mul x10, x2, x9; \ - umulh x19, x2, x9; \ - adcs x17, x17, x10; \ - adc x19, x19, xzr; \ - mul x10, x3, x4; \ - adds x13, x13, x10; \ - mul x10, x3, x5; \ - adcs x14, x14, x10; \ - mul x10, x3, x6; \ - adcs x15, x15, x10; \ - mul x10, x3, x7; \ - adcs x16, x16, x10; \ - mul x10, x3, x8; \ - adcs x17, x17, x10; \ - mul x10, x3, x9; \ - adcs x19, x19, x10; \ - cset x20, hs; \ - umulh x10, x3, x4; \ - adds x14, x14, x10; \ - umulh x10, x3, x5; \ - adcs x15, x15, x10; \ - umulh x10, x3, x6; \ - adcs x16, x16, x10; \ - umulh x10, x3, x7; \ - adcs x17, x17, x10; \ - umulh x10, x3, x8; \ - adcs x19, x19, x10; \ - umulh x10, x3, x9; \ - adc x20, x20, x10; \ - mul x10, x6, x7; \ - umulh x21, x6, x7; \ - adds x20, x20, x10; \ - adc x21, x21, xzr; \ - mul x10, x4, x5; \ - adds x15, x15, x10; \ - mul x10, x4, x6; \ - adcs x16, x16, x10; \ - mul x10, x4, x7; \ - adcs x17, x17, x10; \ - mul x10, x4, x8; \ - adcs x19, x19, x10; \ - mul x10, x4, x9; \ - adcs x20, x20, x10; \ - mul x10, x6, x8; \ - adcs x21, x21, x10; \ - cset x22, hs; \ - umulh x10, x4, x5; \ - adds x16, x16, x10; \ - umulh x10, x4, x6; \ - adcs x17, x17, x10; \ - umulh x10, x4, x7; \ - adcs x19, x19, x10; \ - umulh x10, x4, x8; \ - adcs x20, x20, x10; \ - umulh x10, x4, x9; \ - adcs x21, x21, x10; \ - umulh x10, x6, x8; \ - adc x22, x22, x10; \ - mul x10, x7, x8; \ - umulh x23, x7, x8; \ - adds x22, x22, x10; \ - adc x23, x23, xzr; \ - mul x10, x5, x6; \ - adds x17, x17, x10; \ - mul x10, x5, x7; \ - adcs x19, x19, x10; \ - mul x10, x5, x8; \ - adcs x20, x20, x10; \ - mul x10, x5, x9; \ - adcs x21, x21, x10; \ - mul x10, x6, x9; \ - adcs x22, x22, x10; \ - mul x10, x7, x9; \ - adcs x23, x23, x10; \ - cset x24, hs; \ - umulh x10, x5, x6; \ - adds x19, x19, x10; \ - umulh x10, x5, x7; \ - adcs x20, x20, x10; \ - umulh x10, x5, 
x8; \ - adcs x21, x21, x10; \ - umulh x10, x5, x9; \ - adcs x22, x22, x10; \ - umulh x10, x6, x9; \ - adcs x23, x23, x10; \ - umulh x10, x7, x9; \ - adc x24, x24, x10; \ - mul x10, x8, x9; \ - umulh x25, x8, x9; \ - adds x24, x24, x10; \ - adc x25, x25, xzr; \ - adds x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - adcs x20, x20, x20; \ - adcs x21, x21, x21; \ - adcs x22, x22, x22; \ - adcs x23, x23, x23; \ - adcs x24, x24, x24; \ - adcs x25, x25, x25; \ - cset x0, hs; \ - umulh x10, x2, x2; \ - adds x11, x11, x10; \ - mul x10, x3, x3; \ - adcs x12, x12, x10; \ - umulh x10, x3, x3; \ - adcs x13, x13, x10; \ - mul x10, x4, x4; \ - adcs x14, x14, x10; \ - umulh x10, x4, x4; \ - adcs x15, x15, x10; \ - mul x10, x5, x5; \ - adcs x16, x16, x10; \ - umulh x10, x5, x5; \ - adcs x17, x17, x10; \ - mul x10, x6, x6; \ - adcs x19, x19, x10; \ - umulh x10, x6, x6; \ - adcs x20, x20, x10; \ - mul x10, x7, x7; \ - adcs x21, x21, x10; \ - umulh x10, x7, x7; \ - adcs x22, x22, x10; \ - mul x10, x8, x8; \ - adcs x23, x23, x10; \ - umulh x10, x8, x8; \ - adcs x24, x24, x10; \ - mul x10, x9, x9; \ - adcs x25, x25, x10; \ - umulh x10, x9, x9; \ - adc x0, x0, x10; \ - ldr x1, [P1+64]; \ - add x1, x1, x1; \ - mul x10, x1, x2; \ - adds x19, x19, x10; \ - umulh x10, x1, x2; \ - adcs x20, x20, x10; \ - mul x10, x1, x4; \ - adcs x21, x21, x10; \ - umulh x10, x1, x4; \ - adcs x22, x22, x10; \ - mul x10, x1, x6; \ - adcs x23, x23, x10; \ - umulh x10, x1, x6; \ - adcs x24, x24, x10; \ - mul x10, x1, x8; \ - adcs x25, x25, x10; \ - umulh x10, x1, x8; \ - adcs x0, x0, x10; \ - lsr x4, x1, #1; \ - mul x4, x4, x4; \ - adc x4, x4, xzr; \ - mul x10, x1, x3; \ - adds x20, x20, x10; \ - umulh x10, x1, x3; \ - adcs x21, x21, x10; \ - mul x10, x1, x5; \ - adcs x22, x22, x10; \ - umulh x10, x1, x5; \ - adcs x23, x23, x10; \ - mul x10, x1, x7; \ - adcs x24, x24, x10; \ - umulh x10, x1, x7; \ - adcs x25, x25, x10; \ - mul x10, x1, x9; \ - adcs x0, x0, x10; \ - umulh x10, x1, x9; \ - adc x4, x4, x10; \ - mul x2, x2, x2; \ - cmp xzr, xzr; \ - extr x10, x20, x19, #9; \ - adcs x2, x2, x10; \ - extr x10, x21, x20, #9; \ - adcs x11, x11, x10; \ - extr x10, x22, x21, #9; \ - adcs x12, x12, x10; \ - extr x10, x23, x22, #9; \ - adcs x13, x13, x10; \ - extr x10, x24, x23, #9; \ - adcs x14, x14, x10; \ - extr x10, x25, x24, #9; \ - adcs x15, x15, x10; \ - extr x10, x0, x25, #9; \ - adcs x16, x16, x10; \ - extr x10, x4, x0, #9; \ - adcs x17, x17, x10; \ - orr x19, x19, #0xfffffffffffffe00; \ - lsr x10, x4, #9; \ - adcs x19, x19, x10; \ - sbcs x2, x2, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbcs x15, x15, xzr; \ - sbcs x16, x16, xzr; \ - sbcs x17, x17, xzr; \ - sbc x19, x19, xzr; \ - and x19, x19, #0x1ff; \ - stp x2, x11, [P0]; \ - stp x12, x13, [P0+16]; \ - stp x14, x15, [P0+32]; \ - stp x16, x17, [P0+48]; \ - str x19, [P0+64] - -// Corresponds exactly to bignum_sub_p521 - -#define sub_p521(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - ldp x11, x12, [P1+48]; \ - ldp x4, x3, [P2+48]; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ - ldr x13, [P1+64]; \ - ldr x4, [P2+64]; \ - sbcs x13, x13, x4; \ - sbcs x5, 
x5, xzr; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - and x13, x13, #0x1ff; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] - -S2N_BN_SYMBOL(p521_jadd_alt): - -// Save regs and make room on stack for temporary variables - - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! - stp x23, x24, [sp, #-16]! - stp x25, x26, [sp, #-16]! - stp x27, x28, [sp, #-16]! - sub sp, sp, NSPACE - -// Move the input arguments to stable places - - mov input_z, x0 - mov input_x, x1 - mov input_y, x2 - -// Main code, just a sequence of basic field operations - - sqr_p521(z1sq,z_1) - sqr_p521(z2sq,z_2) - - mul_p521(y1a,z_2,y_1) - mul_p521(y2a,z_1,y_2) - - mul_p521(x2a,z1sq,x_2) - mul_p521(x1a,z2sq,x_1) - mul_p521(y2a,z1sq,y2a) - mul_p521(y1a,z2sq,y1a) - - sub_p521(xd,x2a,x1a) - sub_p521(yd,y2a,y1a) - - sqr_p521(zz,xd) - sqr_p521(ww,yd) - - mul_p521(zzx1,zz,x1a) - mul_p521(zzx2,zz,x2a) - - sub_p521(resx,ww,zzx1) - sub_p521(t1,zzx2,zzx1) - - mul_p521(xd,xd,z_1) - - sub_p521(resx,resx,zzx2) - - sub_p521(t2,zzx1,resx) - - mul_p521(t1,t1,y1a) - mul_p521(resz,xd,z_2) - mul_p521(t2,yd,t2) - - sub_p521(resy,t2,t1) - -// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 -// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) -// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) -// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 -// Multiplex the z outputs accordingly and re-store in resz - - ldp x0, x1, [z_1] - ldp x2, x3, [z_1+16] - ldp x4, x5, [z_1+32] - ldp x6, x7, [z_1+48] - ldr x8, [z_1+64] - - orr x20, x0, x1 - orr x21, x2, x3 - orr x22, x4, x5 - orr x23, x6, x7 - orr x20, x20, x21 - orr x22, x22, x23 - orr x20, x20, x8 - orr x20, x20, x22 - cmp x20, xzr - cset x20, ne - - ldp x10, x11, [z_2] - ldp x12, x13, [z_2+16] - ldp x14, x15, [z_2+32] - ldp x16, x17, [z_2+48] - ldr x19, [z_2+64] - - orr x21, x10, x11 - orr x22, x12, x13 - orr x23, x14, x15 - orr x24, x16, x17 - orr x21, x21, x22 - orr x23, x23, x24 - orr x21, x21, x19 - orr x21, x21, x23 - - csel x0, x0, x10, ne - csel x1, x1, x11, ne - csel x2, x2, x12, ne - csel x3, x3, x13, ne - csel x4, x4, x14, ne - csel x5, x5, x15, ne - csel x6, x6, x16, ne - csel x7, x7, x17, ne - csel x8, x8, x19, ne - - cmp x21, xzr - cset x21, ne - - cmp x21, x20 - - ldp x10, x11, [resz] - ldp x12, x13, [resz+16] - ldp x14, x15, [resz+32] - ldp x16, x17, [resz+48] - ldr x19, [resz+64] - - csel x0, x0, x10, ne - csel x1, x1, x11, ne - csel x2, x2, x12, ne - csel x3, x3, x13, ne - csel x4, x4, x14, ne - csel x5, x5, x15, ne - csel x6, x6, x16, ne - csel x7, x7, x17, ne - csel x8, x8, x19, ne - - stp x0, x1, [resz] - stp x2, x3, [resz+16] - stp x4, x5, [resz+32] - stp x6, x7, [resz+48] - str x8, [resz+64] - -// Multiplex the x and y outputs too, keeping the results in registers - - ldp x20, x21, [x_1] - ldp x0, x1, [resx] - csel x0, x20, x0, lo - csel x1, x21, x1, lo - ldp x20, x21, [x_2] - csel x0, x20, x0, hi - csel x1, x21, x1, hi - - ldp x20, x21, [x_1+16] - ldp x2, x3, [resx+16] - csel x2, x20, x2, lo - csel x3, x21, x3, lo - ldp x20, x21, [x_2+16] - csel x2, x20, x2, hi - csel x3, x21, x3, hi - - ldp x20, x21, [x_1+32] - ldp x4, x5, [resx+32] - csel x4, x20, x4, lo - csel x5, x21, x5, lo - ldp x20, x21, [x_2+32] - csel x4, x20, x4, hi - csel x5, x21, x5, hi - - ldp x20, x21, [x_1+48] - ldp x6, x7, [resx+48] - csel x6, x20, x6, lo - csel 
x7, x21, x7, lo - ldp x20, x21, [x_2+48] - csel x6, x20, x6, hi - csel x7, x21, x7, hi - - ldr x20, [x_1+64] - ldr x8, [resx+64] - csel x8, x20, x8, lo - ldr x21, [x_2+64] - csel x8, x21, x8, hi - - - ldp x20, x21, [y_1] - ldp x10, x11, [resy] - csel x10, x20, x10, lo - csel x11, x21, x11, lo - ldp x20, x21, [y_2] - csel x10, x20, x10, hi - csel x11, x21, x11, hi - - ldp x20, x21, [y_1+16] - ldp x12, x13, [resy+16] - csel x12, x20, x12, lo - csel x13, x21, x13, lo - ldp x20, x21, [y_2+16] - csel x12, x20, x12, hi - csel x13, x21, x13, hi - - ldp x20, x21, [y_1+32] - ldp x14, x15, [resy+32] - csel x14, x20, x14, lo - csel x15, x21, x15, lo - ldp x20, x21, [y_2+32] - csel x14, x20, x14, hi - csel x15, x21, x15, hi - - ldp x20, x21, [y_1+48] - ldp x16, x17, [resy+48] - csel x16, x20, x16, lo - csel x17, x21, x17, lo - ldp x20, x21, [y_2+48] - csel x16, x20, x16, hi - csel x17, x21, x17, hi - - ldr x20, [y_1+64] - ldr x19, [resy+64] - csel x19, x20, x19, lo - ldr x21, [y_2+64] - csel x19, x21, x19, hi - -// Finally store back the multiplexed values - - stp x0, x1, [x_3] - stp x2, x3, [x_3+16] - stp x4, x5, [x_3+32] - stp x6, x7, [x_3+48] - str x8, [x_3+64] - - ldp x0, x1, [resz] - ldp x2, x3, [resz+16] - ldp x4, x5, [resz+32] - ldp x6, x7, [resz+48] - ldr x8, [resz+64] - - stp x10, x11, [y_3] - stp x12, x13, [y_3+16] - stp x14, x15, [y_3+32] - stp x16, x17, [y_3+48] - str x19, [y_3+64] - - stp x0, x1, [z_3] - stp x2, x3, [z_3+16] - stp x4, x5, [z_3+32] - stp x6, x7, [z_3+48] - str x8, [z_3+64] - -// Restore stack and registers - - add sp, sp, NSPACE - - ldp x27, x28, [sp], 16 - ldp x25, x26, [sp], 16 - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits -#endif diff --git a/third_party/s2n-bignum/arm/p521/p521_jdouble_alt.S b/third_party/s2n-bignum/arm/p521/p521_jdouble_alt.S deleted file mode 100644 index fa61dcf8d9e..00000000000 --- a/third_party/s2n-bignum/arm/p521/p521_jdouble_alt.S +++ /dev/null @@ -1,1458 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Point doubling on NIST curve P-521 in Jacobian coordinates -// -// extern void p521_jdouble_alt -// (uint64_t p3[static 27],uint64_t p1[static 27]); -// -// Does p3 := 2 * p1 where all points are regarded as Jacobian triples. -// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). -// It is assumed that all coordinates of the input point are fully -// reduced mod p_521 and that the z coordinate is not zero. 
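
Editorial note, not part of the diff: the field-operation sequence of the deleted p521_jdouble_alt further down matches the usual Jacobian doubling for a curve with a = -3, which P-521 is. Writing gamma = y^2, beta = x*y^2 and alpha = 3*x^2 + a*z^4 = 3*(x - z^2)*(x + z^2) = 3*x2p, the textbook formulas are

    x' = alpha^2 - 8*beta
    y' = alpha*(4*beta - x') - 8*gamma^2
    z' = (y + z)^2 - y^2 - z^2 = 2*y*z

and since x4p = x2p^2 = (alpha/3)^2, the code's d = 12*xy2 - 9*x4p is 12*beta - alpha^2, so x_3 = 4*xy2 - d = alpha^2 - 8*beta and y_3 = 3*dx2 - 8*y4 = alpha*(12*beta - alpha^2) - 8*gamma^2, in agreement with the formulas above.
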
-// -// Standard ARM ABI: X0 = p3, X1 = p1 -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jdouble_alt) - S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jdouble_alt) - - .text - .balign 4 - -// Size of individual field elements - -#define NUMSIZE 72 - -// Stable homes for input arguments during main code sequence - -#define input_z x26 -#define input_x x27 - -// Pointer-offset pairs for inputs and outputs - -#define x_1 input_x, #0 -#define y_1 input_x, #NUMSIZE -#define z_1 input_x, #(2*NUMSIZE) - -#define x_3 input_z, #0 -#define y_3 input_z, #NUMSIZE -#define z_3 input_z, #(2*NUMSIZE) - -// Pointer-offset pairs for temporaries - -#define z2 sp, #(NUMSIZE*0) -#define y2 sp, #(NUMSIZE*1) -#define x2p sp, #(NUMSIZE*2) -#define xy2 sp, #(NUMSIZE*3) - -#define y4 sp, #(NUMSIZE*4) -#define t2 sp, #(NUMSIZE*4) - -#define dx2 sp, #(NUMSIZE*5) -#define t1 sp, #(NUMSIZE*5) - -#define d sp, #(NUMSIZE*6) -#define x4p sp, #(NUMSIZE*6) - -// NUMSIZE*7 is not 16-aligned so we round it up - -#define NSPACE (NUMSIZE*7+8) - -// Corresponds exactly to bignum_mul_p521_alt - -#define mul_p521(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x15, x3, x5; \ - umulh x16, x3, x5; \ - mul x14, x3, x6; \ - umulh x17, x3, x6; \ - adds x16, x16, x14; \ - ldp x7, x8, [P2+16]; \ - mul x14, x3, x7; \ - umulh x19, x3, x7; \ - adcs x17, x17, x14; \ - mul x14, x3, x8; \ - umulh x20, x3, x8; \ - adcs x19, x19, x14; \ - ldp x9, x10, [P2+32]; \ - mul x14, x3, x9; \ - umulh x21, x3, x9; \ - adcs x20, x20, x14; \ - mul x14, x3, x10; \ - umulh x22, x3, x10; \ - adcs x21, x21, x14; \ - ldp x11, x12, [P2+48]; \ - mul x14, x3, x11; \ - umulh x23, x3, x11; \ - adcs x22, x22, x14; \ - ldr x13, [P2+64]; \ - mul x14, x3, x12; \ - umulh x24, x3, x12; \ - adcs x23, x23, x14; \ - mul x14, x3, x13; \ - umulh x1, x3, x13; \ - adcs x24, x24, x14; \ - adc x1, x1, xzr; \ - mul x14, x4, x5; \ - adds x16, x16, x14; \ - mul x14, x4, x6; \ - adcs x17, x17, x14; \ - mul x14, x4, x7; \ - adcs x19, x19, x14; \ - mul x14, x4, x8; \ - adcs x20, x20, x14; \ - mul x14, x4, x9; \ - adcs x21, x21, x14; \ - mul x14, x4, x10; \ - adcs x22, x22, x14; \ - mul x14, x4, x11; \ - adcs x23, x23, x14; \ - mul x14, x4, x12; \ - adcs x24, x24, x14; \ - mul x14, x4, x13; \ - adcs x1, x1, x14; \ - cset x0, hs; \ - umulh x14, x4, x5; \ - adds x17, x17, x14; \ - umulh x14, x4, x6; \ - adcs x19, x19, x14; \ - umulh x14, x4, x7; \ - adcs x20, x20, x14; \ - umulh x14, x4, x8; \ - adcs x21, x21, x14; \ - umulh x14, x4, x9; \ - adcs x22, x22, x14; \ - umulh x14, x4, x10; \ - adcs x23, x23, x14; \ - umulh x14, x4, x11; \ - adcs x24, x24, x14; \ - umulh x14, x4, x12; \ - adcs x1, x1, x14; \ - umulh x14, x4, x13; \ - adc x0, x0, x14; \ - stp x15, x16, [P0]; \ - ldp x3, x4, [P1+16]; \ - mul x14, x3, x5; \ - adds x17, x17, x14; \ - mul x14, x3, x6; \ - adcs x19, x19, x14; \ - mul x14, x3, x7; \ - adcs x20, x20, x14; \ - mul x14, x3, x8; \ - adcs x21, x21, x14; \ - mul x14, x3, x9; \ - adcs x22, x22, x14; \ - mul x14, x3, x10; \ - adcs x23, x23, x14; \ - mul x14, x3, x11; \ - adcs x24, x24, x14; \ - mul x14, x3, x12; \ - adcs x1, x1, x14; \ - mul x14, x3, x13; \ - adcs x0, x0, x14; \ - cset x15, hs; \ - umulh x14, x3, x5; \ - adds x19, x19, x14; \ - umulh x14, x3, x6; \ - adcs x20, x20, x14; \ - umulh x14, x3, x7; \ - adcs x21, x21, x14; \ - umulh x14, x3, x8; \ - adcs x22, x22, x14; \ - umulh x14, x3, x9; \ - adcs x23, x23, x14; \ - umulh x14, x3, x10; \ - adcs x24, x24, 
x14; \ - umulh x14, x3, x11; \ - adcs x1, x1, x14; \ - umulh x14, x3, x12; \ - adcs x0, x0, x14; \ - umulh x14, x3, x13; \ - adc x15, x15, x14; \ - mul x14, x4, x5; \ - adds x19, x19, x14; \ - mul x14, x4, x6; \ - adcs x20, x20, x14; \ - mul x14, x4, x7; \ - adcs x21, x21, x14; \ - mul x14, x4, x8; \ - adcs x22, x22, x14; \ - mul x14, x4, x9; \ - adcs x23, x23, x14; \ - mul x14, x4, x10; \ - adcs x24, x24, x14; \ - mul x14, x4, x11; \ - adcs x1, x1, x14; \ - mul x14, x4, x12; \ - adcs x0, x0, x14; \ - mul x14, x4, x13; \ - adcs x15, x15, x14; \ - cset x16, hs; \ - umulh x14, x4, x5; \ - adds x20, x20, x14; \ - umulh x14, x4, x6; \ - adcs x21, x21, x14; \ - umulh x14, x4, x7; \ - adcs x22, x22, x14; \ - umulh x14, x4, x8; \ - adcs x23, x23, x14; \ - umulh x14, x4, x9; \ - adcs x24, x24, x14; \ - umulh x14, x4, x10; \ - adcs x1, x1, x14; \ - umulh x14, x4, x11; \ - adcs x0, x0, x14; \ - umulh x14, x4, x12; \ - adcs x15, x15, x14; \ - umulh x14, x4, x13; \ - adc x16, x16, x14; \ - stp x17, x19, [P0+16]; \ - ldp x3, x4, [P1+32]; \ - mul x14, x3, x5; \ - adds x20, x20, x14; \ - mul x14, x3, x6; \ - adcs x21, x21, x14; \ - mul x14, x3, x7; \ - adcs x22, x22, x14; \ - mul x14, x3, x8; \ - adcs x23, x23, x14; \ - mul x14, x3, x9; \ - adcs x24, x24, x14; \ - mul x14, x3, x10; \ - adcs x1, x1, x14; \ - mul x14, x3, x11; \ - adcs x0, x0, x14; \ - mul x14, x3, x12; \ - adcs x15, x15, x14; \ - mul x14, x3, x13; \ - adcs x16, x16, x14; \ - cset x17, hs; \ - umulh x14, x3, x5; \ - adds x21, x21, x14; \ - umulh x14, x3, x6; \ - adcs x22, x22, x14; \ - umulh x14, x3, x7; \ - adcs x23, x23, x14; \ - umulh x14, x3, x8; \ - adcs x24, x24, x14; \ - umulh x14, x3, x9; \ - adcs x1, x1, x14; \ - umulh x14, x3, x10; \ - adcs x0, x0, x14; \ - umulh x14, x3, x11; \ - adcs x15, x15, x14; \ - umulh x14, x3, x12; \ - adcs x16, x16, x14; \ - umulh x14, x3, x13; \ - adc x17, x17, x14; \ - mul x14, x4, x5; \ - adds x21, x21, x14; \ - mul x14, x4, x6; \ - adcs x22, x22, x14; \ - mul x14, x4, x7; \ - adcs x23, x23, x14; \ - mul x14, x4, x8; \ - adcs x24, x24, x14; \ - mul x14, x4, x9; \ - adcs x1, x1, x14; \ - mul x14, x4, x10; \ - adcs x0, x0, x14; \ - mul x14, x4, x11; \ - adcs x15, x15, x14; \ - mul x14, x4, x12; \ - adcs x16, x16, x14; \ - mul x14, x4, x13; \ - adcs x17, x17, x14; \ - cset x19, hs; \ - umulh x14, x4, x5; \ - adds x22, x22, x14; \ - umulh x14, x4, x6; \ - adcs x23, x23, x14; \ - umulh x14, x4, x7; \ - adcs x24, x24, x14; \ - umulh x14, x4, x8; \ - adcs x1, x1, x14; \ - umulh x14, x4, x9; \ - adcs x0, x0, x14; \ - umulh x14, x4, x10; \ - adcs x15, x15, x14; \ - umulh x14, x4, x11; \ - adcs x16, x16, x14; \ - umulh x14, x4, x12; \ - adcs x17, x17, x14; \ - umulh x14, x4, x13; \ - adc x19, x19, x14; \ - stp x20, x21, [P0+32]; \ - ldp x3, x4, [P1+48]; \ - mul x14, x3, x5; \ - adds x22, x22, x14; \ - mul x14, x3, x6; \ - adcs x23, x23, x14; \ - mul x14, x3, x7; \ - adcs x24, x24, x14; \ - mul x14, x3, x8; \ - adcs x1, x1, x14; \ - mul x14, x3, x9; \ - adcs x0, x0, x14; \ - mul x14, x3, x10; \ - adcs x15, x15, x14; \ - mul x14, x3, x11; \ - adcs x16, x16, x14; \ - mul x14, x3, x12; \ - adcs x17, x17, x14; \ - mul x14, x3, x13; \ - adcs x19, x19, x14; \ - cset x20, hs; \ - umulh x14, x3, x5; \ - adds x23, x23, x14; \ - umulh x14, x3, x6; \ - adcs x24, x24, x14; \ - umulh x14, x3, x7; \ - adcs x1, x1, x14; \ - umulh x14, x3, x8; \ - adcs x0, x0, x14; \ - umulh x14, x3, x9; \ - adcs x15, x15, x14; \ - umulh x14, x3, x10; \ - adcs x16, x16, x14; \ - umulh x14, x3, x11; \ - adcs x17, x17, x14; \ - umulh x14, x3, 
x12; \ - adcs x19, x19, x14; \ - umulh x14, x3, x13; \ - adc x20, x20, x14; \ - mul x14, x4, x5; \ - adds x23, x23, x14; \ - mul x14, x4, x6; \ - adcs x24, x24, x14; \ - mul x14, x4, x7; \ - adcs x1, x1, x14; \ - mul x14, x4, x8; \ - adcs x0, x0, x14; \ - mul x14, x4, x9; \ - adcs x15, x15, x14; \ - mul x14, x4, x10; \ - adcs x16, x16, x14; \ - mul x14, x4, x11; \ - adcs x17, x17, x14; \ - mul x14, x4, x12; \ - adcs x19, x19, x14; \ - mul x14, x4, x13; \ - adcs x20, x20, x14; \ - cset x21, hs; \ - umulh x14, x4, x5; \ - adds x24, x24, x14; \ - umulh x14, x4, x6; \ - adcs x1, x1, x14; \ - umulh x14, x4, x7; \ - adcs x0, x0, x14; \ - umulh x14, x4, x8; \ - adcs x15, x15, x14; \ - umulh x14, x4, x9; \ - adcs x16, x16, x14; \ - umulh x14, x4, x10; \ - adcs x17, x17, x14; \ - umulh x14, x4, x11; \ - adcs x19, x19, x14; \ - umulh x14, x4, x12; \ - adcs x20, x20, x14; \ - umulh x14, x4, x13; \ - adc x21, x21, x14; \ - stp x22, x23, [P0+48]; \ - ldr x3, [P1+64]; \ - mul x14, x3, x5; \ - adds x24, x24, x14; \ - mul x14, x3, x6; \ - adcs x1, x1, x14; \ - mul x14, x3, x7; \ - adcs x0, x0, x14; \ - mul x14, x3, x8; \ - adcs x15, x15, x14; \ - mul x14, x3, x9; \ - adcs x16, x16, x14; \ - mul x14, x3, x10; \ - adcs x17, x17, x14; \ - mul x14, x3, x11; \ - adcs x19, x19, x14; \ - mul x14, x3, x12; \ - adcs x20, x20, x14; \ - mul x14, x3, x13; \ - adc x21, x21, x14; \ - umulh x14, x3, x5; \ - adds x1, x1, x14; \ - umulh x14, x3, x6; \ - adcs x0, x0, x14; \ - umulh x14, x3, x7; \ - adcs x15, x15, x14; \ - umulh x14, x3, x8; \ - adcs x16, x16, x14; \ - umulh x14, x3, x9; \ - adcs x17, x17, x14; \ - umulh x14, x3, x10; \ - adcs x19, x19, x14; \ - umulh x14, x3, x11; \ - adcs x20, x20, x14; \ - umulh x14, x3, x12; \ - adc x21, x21, x14; \ - cmp xzr, xzr; \ - ldp x5, x6, [P0]; \ - extr x14, x1, x24, #9; \ - adcs x5, x5, x14; \ - extr x14, x0, x1, #9; \ - adcs x6, x6, x14; \ - ldp x7, x8, [P0+16]; \ - extr x14, x15, x0, #9; \ - adcs x7, x7, x14; \ - extr x14, x16, x15, #9; \ - adcs x8, x8, x14; \ - ldp x9, x10, [P0+32]; \ - extr x14, x17, x16, #9; \ - adcs x9, x9, x14; \ - extr x14, x19, x17, #9; \ - adcs x10, x10, x14; \ - ldp x11, x12, [P0+48]; \ - extr x14, x20, x19, #9; \ - adcs x11, x11, x14; \ - extr x14, x21, x20, #9; \ - adcs x12, x12, x14; \ - orr x13, x24, #0xfffffffffffffe00; \ - lsr x14, x21, #9; \ - adcs x13, x13, x14; \ - sbcs x5, x5, xzr; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - and x13, x13, #0x1ff; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] - -// Corresponds exactly to bignum_sqr_p521_alt - -#define sqr_p521(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x11, x2, x3; \ - umulh x12, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x10, x2, x4; \ - umulh x13, x2, x4; \ - adds x12, x12, x10; \ - ldp x6, x7, [P1+32]; \ - mul x10, x2, x5; \ - umulh x14, x2, x5; \ - adcs x13, x13, x10; \ - ldp x8, x9, [P1+48]; \ - mul x10, x2, x6; \ - umulh x15, x2, x6; \ - adcs x14, x14, x10; \ - mul x10, x2, x7; \ - umulh x16, x2, x7; \ - adcs x15, x15, x10; \ - mul x10, x2, x8; \ - umulh x17, x2, x8; \ - adcs x16, x16, x10; \ - mul x10, x2, x9; \ - umulh x19, x2, x9; \ - adcs x17, x17, x10; \ - adc x19, x19, xzr; \ - mul x10, x3, x4; \ - adds x13, x13, x10; \ - mul x10, x3, x5; \ - adcs x14, x14, x10; \ - mul x10, x3, x6; \ - adcs x15, x15, x10; \ - mul x10, x3, x7; \ - adcs x16, x16, x10; \ - mul x10, x3, x8; \ 
- adcs x17, x17, x10; \ - mul x10, x3, x9; \ - adcs x19, x19, x10; \ - cset x20, hs; \ - umulh x10, x3, x4; \ - adds x14, x14, x10; \ - umulh x10, x3, x5; \ - adcs x15, x15, x10; \ - umulh x10, x3, x6; \ - adcs x16, x16, x10; \ - umulh x10, x3, x7; \ - adcs x17, x17, x10; \ - umulh x10, x3, x8; \ - adcs x19, x19, x10; \ - umulh x10, x3, x9; \ - adc x20, x20, x10; \ - mul x10, x6, x7; \ - umulh x21, x6, x7; \ - adds x20, x20, x10; \ - adc x21, x21, xzr; \ - mul x10, x4, x5; \ - adds x15, x15, x10; \ - mul x10, x4, x6; \ - adcs x16, x16, x10; \ - mul x10, x4, x7; \ - adcs x17, x17, x10; \ - mul x10, x4, x8; \ - adcs x19, x19, x10; \ - mul x10, x4, x9; \ - adcs x20, x20, x10; \ - mul x10, x6, x8; \ - adcs x21, x21, x10; \ - cset x22, hs; \ - umulh x10, x4, x5; \ - adds x16, x16, x10; \ - umulh x10, x4, x6; \ - adcs x17, x17, x10; \ - umulh x10, x4, x7; \ - adcs x19, x19, x10; \ - umulh x10, x4, x8; \ - adcs x20, x20, x10; \ - umulh x10, x4, x9; \ - adcs x21, x21, x10; \ - umulh x10, x6, x8; \ - adc x22, x22, x10; \ - mul x10, x7, x8; \ - umulh x23, x7, x8; \ - adds x22, x22, x10; \ - adc x23, x23, xzr; \ - mul x10, x5, x6; \ - adds x17, x17, x10; \ - mul x10, x5, x7; \ - adcs x19, x19, x10; \ - mul x10, x5, x8; \ - adcs x20, x20, x10; \ - mul x10, x5, x9; \ - adcs x21, x21, x10; \ - mul x10, x6, x9; \ - adcs x22, x22, x10; \ - mul x10, x7, x9; \ - adcs x23, x23, x10; \ - cset x24, hs; \ - umulh x10, x5, x6; \ - adds x19, x19, x10; \ - umulh x10, x5, x7; \ - adcs x20, x20, x10; \ - umulh x10, x5, x8; \ - adcs x21, x21, x10; \ - umulh x10, x5, x9; \ - adcs x22, x22, x10; \ - umulh x10, x6, x9; \ - adcs x23, x23, x10; \ - umulh x10, x7, x9; \ - adc x24, x24, x10; \ - mul x10, x8, x9; \ - umulh x25, x8, x9; \ - adds x24, x24, x10; \ - adc x25, x25, xzr; \ - adds x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - adcs x20, x20, x20; \ - adcs x21, x21, x21; \ - adcs x22, x22, x22; \ - adcs x23, x23, x23; \ - adcs x24, x24, x24; \ - adcs x25, x25, x25; \ - cset x0, hs; \ - umulh x10, x2, x2; \ - adds x11, x11, x10; \ - mul x10, x3, x3; \ - adcs x12, x12, x10; \ - umulh x10, x3, x3; \ - adcs x13, x13, x10; \ - mul x10, x4, x4; \ - adcs x14, x14, x10; \ - umulh x10, x4, x4; \ - adcs x15, x15, x10; \ - mul x10, x5, x5; \ - adcs x16, x16, x10; \ - umulh x10, x5, x5; \ - adcs x17, x17, x10; \ - mul x10, x6, x6; \ - adcs x19, x19, x10; \ - umulh x10, x6, x6; \ - adcs x20, x20, x10; \ - mul x10, x7, x7; \ - adcs x21, x21, x10; \ - umulh x10, x7, x7; \ - adcs x22, x22, x10; \ - mul x10, x8, x8; \ - adcs x23, x23, x10; \ - umulh x10, x8, x8; \ - adcs x24, x24, x10; \ - mul x10, x9, x9; \ - adcs x25, x25, x10; \ - umulh x10, x9, x9; \ - adc x0, x0, x10; \ - ldr x1, [P1+64]; \ - add x1, x1, x1; \ - mul x10, x1, x2; \ - adds x19, x19, x10; \ - umulh x10, x1, x2; \ - adcs x20, x20, x10; \ - mul x10, x1, x4; \ - adcs x21, x21, x10; \ - umulh x10, x1, x4; \ - adcs x22, x22, x10; \ - mul x10, x1, x6; \ - adcs x23, x23, x10; \ - umulh x10, x1, x6; \ - adcs x24, x24, x10; \ - mul x10, x1, x8; \ - adcs x25, x25, x10; \ - umulh x10, x1, x8; \ - adcs x0, x0, x10; \ - lsr x4, x1, #1; \ - mul x4, x4, x4; \ - adc x4, x4, xzr; \ - mul x10, x1, x3; \ - adds x20, x20, x10; \ - umulh x10, x1, x3; \ - adcs x21, x21, x10; \ - mul x10, x1, x5; \ - adcs x22, x22, x10; \ - umulh x10, x1, x5; \ - adcs x23, x23, x10; \ - mul x10, x1, x7; \ - adcs x24, x24, x10; \ - umulh x10, x1, x7; \ - adcs x25, 
x25, x10; \ - mul x10, x1, x9; \ - adcs x0, x0, x10; \ - umulh x10, x1, x9; \ - adc x4, x4, x10; \ - mul x2, x2, x2; \ - cmp xzr, xzr; \ - extr x10, x20, x19, #9; \ - adcs x2, x2, x10; \ - extr x10, x21, x20, #9; \ - adcs x11, x11, x10; \ - extr x10, x22, x21, #9; \ - adcs x12, x12, x10; \ - extr x10, x23, x22, #9; \ - adcs x13, x13, x10; \ - extr x10, x24, x23, #9; \ - adcs x14, x14, x10; \ - extr x10, x25, x24, #9; \ - adcs x15, x15, x10; \ - extr x10, x0, x25, #9; \ - adcs x16, x16, x10; \ - extr x10, x4, x0, #9; \ - adcs x17, x17, x10; \ - orr x19, x19, #0xfffffffffffffe00; \ - lsr x10, x4, #9; \ - adcs x19, x19, x10; \ - sbcs x2, x2, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbcs x15, x15, xzr; \ - sbcs x16, x16, xzr; \ - sbcs x17, x17, xzr; \ - sbc x19, x19, xzr; \ - and x19, x19, #0x1ff; \ - stp x2, x11, [P0]; \ - stp x12, x13, [P0+16]; \ - stp x14, x15, [P0+32]; \ - stp x16, x17, [P0+48]; \ - str x19, [P0+64] - -// Corresponds exactly to bignum_add_p521 - -#define add_p521(P0,P1,P2) \ - cmp xzr, xzr; \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - adcs x5, x5, x4; \ - adcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - adcs x7, x7, x4; \ - adcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - adcs x9, x9, x4; \ - adcs x10, x10, x3; \ - ldp x11, x12, [P1+48]; \ - ldp x4, x3, [P2+48]; \ - adcs x11, x11, x4; \ - adcs x12, x12, x3; \ - ldr x13, [P1+64]; \ - ldr x4, [P2+64]; \ - adc x13, x13, x4; \ - subs x4, x13, #512; \ - csetm x4, hs; \ - sbcs x5, x5, xzr; \ - and x4, x4, #0x200; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, x4; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] - -// Corresponds exactly to bignum_sub_p521 - -#define sub_p521(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - ldp x11, x12, [P1+48]; \ - ldp x4, x3, [P2+48]; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ - ldr x13, [P1+64]; \ - ldr x4, [P2+64]; \ - sbcs x13, x13, x4; \ - sbcs x5, x5, xzr; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - and x13, x13, #0x1ff; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] - -// Weak multiplication not fully reducing - -#define weakmul_p521(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x15, x3, x5; \ - umulh x16, x3, x5; \ - mul x14, x3, x6; \ - umulh x17, x3, x6; \ - adds x16, x16, x14; \ - ldp x7, x8, [P2+16]; \ - mul x14, x3, x7; \ - umulh x19, x3, x7; \ - adcs x17, x17, x14; \ - mul x14, x3, x8; \ - umulh x20, x3, x8; \ - adcs x19, x19, x14; \ - ldp x9, x10, [P2+32]; \ - mul x14, x3, x9; \ - umulh x21, x3, x9; \ - adcs x20, x20, x14; \ - mul x14, x3, x10; \ - umulh x22, x3, x10; \ - adcs x21, x21, x14; \ - ldp x11, x12, [P2+48]; \ - mul x14, x3, x11; \ - umulh x23, x3, x11; \ - adcs x22, x22, x14; \ - ldr x13, [P2+64]; \ - mul x14, x3, x12; \ - umulh x24, x3, x12; \ - adcs x23, x23, x14; \ - mul x14, x3, x13; \ - 
umulh x1, x3, x13; \ - adcs x24, x24, x14; \ - adc x1, x1, xzr; \ - mul x14, x4, x5; \ - adds x16, x16, x14; \ - mul x14, x4, x6; \ - adcs x17, x17, x14; \ - mul x14, x4, x7; \ - adcs x19, x19, x14; \ - mul x14, x4, x8; \ - adcs x20, x20, x14; \ - mul x14, x4, x9; \ - adcs x21, x21, x14; \ - mul x14, x4, x10; \ - adcs x22, x22, x14; \ - mul x14, x4, x11; \ - adcs x23, x23, x14; \ - mul x14, x4, x12; \ - adcs x24, x24, x14; \ - mul x14, x4, x13; \ - adcs x1, x1, x14; \ - cset x0, hs; \ - umulh x14, x4, x5; \ - adds x17, x17, x14; \ - umulh x14, x4, x6; \ - adcs x19, x19, x14; \ - umulh x14, x4, x7; \ - adcs x20, x20, x14; \ - umulh x14, x4, x8; \ - adcs x21, x21, x14; \ - umulh x14, x4, x9; \ - adcs x22, x22, x14; \ - umulh x14, x4, x10; \ - adcs x23, x23, x14; \ - umulh x14, x4, x11; \ - adcs x24, x24, x14; \ - umulh x14, x4, x12; \ - adcs x1, x1, x14; \ - umulh x14, x4, x13; \ - adc x0, x0, x14; \ - stp x15, x16, [P0]; \ - ldp x3, x4, [P1+16]; \ - mul x14, x3, x5; \ - adds x17, x17, x14; \ - mul x14, x3, x6; \ - adcs x19, x19, x14; \ - mul x14, x3, x7; \ - adcs x20, x20, x14; \ - mul x14, x3, x8; \ - adcs x21, x21, x14; \ - mul x14, x3, x9; \ - adcs x22, x22, x14; \ - mul x14, x3, x10; \ - adcs x23, x23, x14; \ - mul x14, x3, x11; \ - adcs x24, x24, x14; \ - mul x14, x3, x12; \ - adcs x1, x1, x14; \ - mul x14, x3, x13; \ - adcs x0, x0, x14; \ - cset x15, hs; \ - umulh x14, x3, x5; \ - adds x19, x19, x14; \ - umulh x14, x3, x6; \ - adcs x20, x20, x14; \ - umulh x14, x3, x7; \ - adcs x21, x21, x14; \ - umulh x14, x3, x8; \ - adcs x22, x22, x14; \ - umulh x14, x3, x9; \ - adcs x23, x23, x14; \ - umulh x14, x3, x10; \ - adcs x24, x24, x14; \ - umulh x14, x3, x11; \ - adcs x1, x1, x14; \ - umulh x14, x3, x12; \ - adcs x0, x0, x14; \ - umulh x14, x3, x13; \ - adc x15, x15, x14; \ - mul x14, x4, x5; \ - adds x19, x19, x14; \ - mul x14, x4, x6; \ - adcs x20, x20, x14; \ - mul x14, x4, x7; \ - adcs x21, x21, x14; \ - mul x14, x4, x8; \ - adcs x22, x22, x14; \ - mul x14, x4, x9; \ - adcs x23, x23, x14; \ - mul x14, x4, x10; \ - adcs x24, x24, x14; \ - mul x14, x4, x11; \ - adcs x1, x1, x14; \ - mul x14, x4, x12; \ - adcs x0, x0, x14; \ - mul x14, x4, x13; \ - adcs x15, x15, x14; \ - cset x16, hs; \ - umulh x14, x4, x5; \ - adds x20, x20, x14; \ - umulh x14, x4, x6; \ - adcs x21, x21, x14; \ - umulh x14, x4, x7; \ - adcs x22, x22, x14; \ - umulh x14, x4, x8; \ - adcs x23, x23, x14; \ - umulh x14, x4, x9; \ - adcs x24, x24, x14; \ - umulh x14, x4, x10; \ - adcs x1, x1, x14; \ - umulh x14, x4, x11; \ - adcs x0, x0, x14; \ - umulh x14, x4, x12; \ - adcs x15, x15, x14; \ - umulh x14, x4, x13; \ - adc x16, x16, x14; \ - stp x17, x19, [P0+16]; \ - ldp x3, x4, [P1+32]; \ - mul x14, x3, x5; \ - adds x20, x20, x14; \ - mul x14, x3, x6; \ - adcs x21, x21, x14; \ - mul x14, x3, x7; \ - adcs x22, x22, x14; \ - mul x14, x3, x8; \ - adcs x23, x23, x14; \ - mul x14, x3, x9; \ - adcs x24, x24, x14; \ - mul x14, x3, x10; \ - adcs x1, x1, x14; \ - mul x14, x3, x11; \ - adcs x0, x0, x14; \ - mul x14, x3, x12; \ - adcs x15, x15, x14; \ - mul x14, x3, x13; \ - adcs x16, x16, x14; \ - cset x17, hs; \ - umulh x14, x3, x5; \ - adds x21, x21, x14; \ - umulh x14, x3, x6; \ - adcs x22, x22, x14; \ - umulh x14, x3, x7; \ - adcs x23, x23, x14; \ - umulh x14, x3, x8; \ - adcs x24, x24, x14; \ - umulh x14, x3, x9; \ - adcs x1, x1, x14; \ - umulh x14, x3, x10; \ - adcs x0, x0, x14; \ - umulh x14, x3, x11; \ - adcs x15, x15, x14; \ - umulh x14, x3, x12; \ - adcs x16, x16, x14; \ - umulh x14, x3, x13; \ - adc x17, x17, x14; \ - mul 
x14, x4, x5; \ - adds x21, x21, x14; \ - mul x14, x4, x6; \ - adcs x22, x22, x14; \ - mul x14, x4, x7; \ - adcs x23, x23, x14; \ - mul x14, x4, x8; \ - adcs x24, x24, x14; \ - mul x14, x4, x9; \ - adcs x1, x1, x14; \ - mul x14, x4, x10; \ - adcs x0, x0, x14; \ - mul x14, x4, x11; \ - adcs x15, x15, x14; \ - mul x14, x4, x12; \ - adcs x16, x16, x14; \ - mul x14, x4, x13; \ - adcs x17, x17, x14; \ - cset x19, hs; \ - umulh x14, x4, x5; \ - adds x22, x22, x14; \ - umulh x14, x4, x6; \ - adcs x23, x23, x14; \ - umulh x14, x4, x7; \ - adcs x24, x24, x14; \ - umulh x14, x4, x8; \ - adcs x1, x1, x14; \ - umulh x14, x4, x9; \ - adcs x0, x0, x14; \ - umulh x14, x4, x10; \ - adcs x15, x15, x14; \ - umulh x14, x4, x11; \ - adcs x16, x16, x14; \ - umulh x14, x4, x12; \ - adcs x17, x17, x14; \ - umulh x14, x4, x13; \ - adc x19, x19, x14; \ - stp x20, x21, [P0+32]; \ - ldp x3, x4, [P1+48]; \ - mul x14, x3, x5; \ - adds x22, x22, x14; \ - mul x14, x3, x6; \ - adcs x23, x23, x14; \ - mul x14, x3, x7; \ - adcs x24, x24, x14; \ - mul x14, x3, x8; \ - adcs x1, x1, x14; \ - mul x14, x3, x9; \ - adcs x0, x0, x14; \ - mul x14, x3, x10; \ - adcs x15, x15, x14; \ - mul x14, x3, x11; \ - adcs x16, x16, x14; \ - mul x14, x3, x12; \ - adcs x17, x17, x14; \ - mul x14, x3, x13; \ - adcs x19, x19, x14; \ - cset x20, hs; \ - umulh x14, x3, x5; \ - adds x23, x23, x14; \ - umulh x14, x3, x6; \ - adcs x24, x24, x14; \ - umulh x14, x3, x7; \ - adcs x1, x1, x14; \ - umulh x14, x3, x8; \ - adcs x0, x0, x14; \ - umulh x14, x3, x9; \ - adcs x15, x15, x14; \ - umulh x14, x3, x10; \ - adcs x16, x16, x14; \ - umulh x14, x3, x11; \ - adcs x17, x17, x14; \ - umulh x14, x3, x12; \ - adcs x19, x19, x14; \ - umulh x14, x3, x13; \ - adc x20, x20, x14; \ - mul x14, x4, x5; \ - adds x23, x23, x14; \ - mul x14, x4, x6; \ - adcs x24, x24, x14; \ - mul x14, x4, x7; \ - adcs x1, x1, x14; \ - mul x14, x4, x8; \ - adcs x0, x0, x14; \ - mul x14, x4, x9; \ - adcs x15, x15, x14; \ - mul x14, x4, x10; \ - adcs x16, x16, x14; \ - mul x14, x4, x11; \ - adcs x17, x17, x14; \ - mul x14, x4, x12; \ - adcs x19, x19, x14; \ - mul x14, x4, x13; \ - adcs x20, x20, x14; \ - cset x21, hs; \ - umulh x14, x4, x5; \ - adds x24, x24, x14; \ - umulh x14, x4, x6; \ - adcs x1, x1, x14; \ - umulh x14, x4, x7; \ - adcs x0, x0, x14; \ - umulh x14, x4, x8; \ - adcs x15, x15, x14; \ - umulh x14, x4, x9; \ - adcs x16, x16, x14; \ - umulh x14, x4, x10; \ - adcs x17, x17, x14; \ - umulh x14, x4, x11; \ - adcs x19, x19, x14; \ - umulh x14, x4, x12; \ - adcs x20, x20, x14; \ - umulh x14, x4, x13; \ - adc x21, x21, x14; \ - stp x22, x23, [P0+48]; \ - ldr x3, [P1+64]; \ - mul x14, x3, x5; \ - adds x24, x24, x14; \ - mul x14, x3, x6; \ - adcs x1, x1, x14; \ - mul x14, x3, x7; \ - adcs x0, x0, x14; \ - mul x14, x3, x8; \ - adcs x15, x15, x14; \ - mul x14, x3, x9; \ - adcs x16, x16, x14; \ - mul x14, x3, x10; \ - adcs x17, x17, x14; \ - mul x14, x3, x11; \ - adcs x19, x19, x14; \ - mul x14, x3, x12; \ - adcs x20, x20, x14; \ - mul x14, x3, x13; \ - adc x21, x21, x14; \ - umulh x14, x3, x5; \ - adds x1, x1, x14; \ - umulh x14, x3, x6; \ - adcs x0, x0, x14; \ - umulh x14, x3, x7; \ - adcs x15, x15, x14; \ - umulh x14, x3, x8; \ - adcs x16, x16, x14; \ - umulh x14, x3, x9; \ - adcs x17, x17, x14; \ - umulh x14, x3, x10; \ - adcs x19, x19, x14; \ - umulh x14, x3, x11; \ - adcs x20, x20, x14; \ - umulh x14, x3, x12; \ - adc x21, x21, x14; \ - ldp x5, x6, [P0]; \ - extr x14, x1, x24, #9; \ - adds x5, x5, x14; \ - extr x14, x0, x1, #9; \ - adcs x6, x6, x14; \ - ldp x7, x8, [P0+16]; \ - 
extr x14, x15, x0, #9; \ - adcs x7, x7, x14; \ - extr x14, x16, x15, #9; \ - adcs x8, x8, x14; \ - ldp x9, x10, [P0+32]; \ - extr x14, x17, x16, #9; \ - adcs x9, x9, x14; \ - extr x14, x19, x17, #9; \ - adcs x10, x10, x14; \ - ldp x11, x12, [P0+48]; \ - extr x14, x20, x19, #9; \ - adcs x11, x11, x14; \ - extr x14, x21, x20, #9; \ - adcs x12, x12, x14; \ - and x13, x24, #0x1ff; \ - lsr x14, x21, #9; \ - adc x13, x13, x14; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] - -// P0 = C * P1 - D * P2 == C * P1 + D * (p_521 - P2) - -#define cmsub_p521(P0,C,P1,D,P2) \ - ldp x6, x7, [P1]; \ - mov x1, #(C); \ - mul x3, x1, x6; \ - mul x4, x1, x7; \ - umulh x6, x1, x6; \ - adds x4, x4, x6; \ - umulh x7, x1, x7; \ - ldp x8, x9, [P1+16]; \ - mul x5, x1, x8; \ - mul x6, x1, x9; \ - umulh x8, x1, x8; \ - adcs x5, x5, x7; \ - umulh x9, x1, x9; \ - adcs x6, x6, x8; \ - ldp x10, x11, [P1+32]; \ - mul x7, x1, x10; \ - mul x8, x1, x11; \ - umulh x10, x1, x10; \ - adcs x7, x7, x9; \ - umulh x11, x1, x11; \ - adcs x8, x8, x10; \ - ldp x12, x13, [P1+48]; \ - mul x9, x1, x12; \ - mul x10, x1, x13; \ - umulh x12, x1, x12; \ - adcs x9, x9, x11; \ - umulh x13, x1, x13; \ - adcs x10, x10, x12; \ - ldr x14, [P1+64]; \ - mul x11, x1, x14; \ - adc x11, x11, x13; \ - mov x1, #(D); \ - ldp x20, x21, [P2]; \ - mvn x20, x20; \ - mul x0, x1, x20; \ - umulh x20, x1, x20; \ - adds x3, x3, x0; \ - mvn x21, x21; \ - mul x0, x1, x21; \ - umulh x21, x1, x21; \ - adcs x4, x4, x0; \ - ldp x22, x23, [P2+16]; \ - mvn x22, x22; \ - mul x0, x1, x22; \ - umulh x22, x1, x22; \ - adcs x5, x5, x0; \ - mvn x23, x23; \ - mul x0, x1, x23; \ - umulh x23, x1, x23; \ - adcs x6, x6, x0; \ - ldp x17, x19, [P2+32]; \ - mvn x17, x17; \ - mul x0, x1, x17; \ - umulh x17, x1, x17; \ - adcs x7, x7, x0; \ - mvn x19, x19; \ - mul x0, x1, x19; \ - umulh x19, x1, x19; \ - adcs x8, x8, x0; \ - ldp x2, x16, [P2+48]; \ - mvn x2, x2; \ - mul x0, x1, x2; \ - umulh x2, x1, x2; \ - adcs x9, x9, x0; \ - mvn x16, x16; \ - mul x0, x1, x16; \ - umulh x16, x1, x16; \ - adcs x10, x10, x0; \ - ldr x0, [P2+64]; \ - eor x0, x0, #0x1ff; \ - mul x0, x1, x0; \ - adc x11, x11, x0; \ - adds x4, x4, x20; \ - adcs x5, x5, x21; \ - and x15, x4, x5; \ - adcs x6, x6, x22; \ - and x15, x15, x6; \ - adcs x7, x7, x23; \ - and x15, x15, x7; \ - adcs x8, x8, x17; \ - and x15, x15, x8; \ - adcs x9, x9, x19; \ - and x15, x15, x9; \ - adcs x10, x10, x2; \ - and x15, x15, x10; \ - adc x11, x11, x16; \ - lsr x12, x11, #9; \ - orr x11, x11, #0xfffffffffffffe00; \ - cmp xzr, xzr; \ - adcs xzr, x3, x12; \ - adcs xzr, x15, xzr; \ - adcs xzr, x11, xzr; \ - adcs x3, x3, x12; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adcs x6, x6, xzr; \ - adcs x7, x7, xzr; \ - adcs x8, x8, xzr; \ - adcs x9, x9, xzr; \ - adcs x10, x10, xzr; \ - adc x11, x11, xzr; \ - and x11, x11, #0x1ff; \ - stp x3, x4, [P0]; \ - stp x5, x6, [P0+16]; \ - stp x7, x8, [P0+32]; \ - stp x9, x10, [P0+48]; \ - str x11, [P0+64] - -// P0 = 3 * P1 - 8 * P2 == 3 * P1 + 8 * (p_521 - P2) - -#define cmsub38_p521(P0,P1,P2) \ - ldp x6, x7, [P1]; \ - lsl x3, x6, #1; \ - adds x3, x3, x6; \ - extr x4, x7, x6, #63; \ - adcs x4, x4, x7; \ - ldp x8, x9, [P1+16]; \ - extr x5, x8, x7, #63; \ - adcs x5, x5, x8; \ - extr x6, x9, x8, #63; \ - adcs x6, x6, x9; \ - ldp x10, x11, [P1+32]; \ - extr x7, x10, x9, #63; \ - adcs x7, x7, x10; \ - extr x8, x11, x10, #63; \ - adcs x8, x8, x11; \ - ldp x12, x13, [P1+48]; \ - extr x9, x12, x11, #63; \ - adcs x9, x9, x12; \ - extr x10, x13, x12, 
#63; \ - adcs x10, x10, x13; \ - ldr x14, [P1+64]; \ - extr x11, x14, x13, #63; \ - adc x11, x11, x14; \ - ldp x20, x21, [P2]; \ - mvn x20, x20; \ - lsl x0, x20, #3; \ - adds x3, x3, x0; \ - mvn x21, x21; \ - extr x0, x21, x20, #61; \ - adcs x4, x4, x0; \ - ldp x22, x23, [P2+16]; \ - mvn x22, x22; \ - extr x0, x22, x21, #61; \ - adcs x5, x5, x0; \ - and x15, x4, x5; \ - mvn x23, x23; \ - extr x0, x23, x22, #61; \ - adcs x6, x6, x0; \ - and x15, x15, x6; \ - ldp x20, x21, [P2+32]; \ - mvn x20, x20; \ - extr x0, x20, x23, #61; \ - adcs x7, x7, x0; \ - and x15, x15, x7; \ - mvn x21, x21; \ - extr x0, x21, x20, #61; \ - adcs x8, x8, x0; \ - and x15, x15, x8; \ - ldp x22, x23, [P2+48]; \ - mvn x22, x22; \ - extr x0, x22, x21, #61; \ - adcs x9, x9, x0; \ - and x15, x15, x9; \ - mvn x23, x23; \ - extr x0, x23, x22, #61; \ - adcs x10, x10, x0; \ - and x15, x15, x10; \ - ldr x0, [P2+64]; \ - eor x0, x0, #0x1ff; \ - extr x0, x0, x23, #61; \ - adc x11, x11, x0; \ - lsr x12, x11, #9; \ - orr x11, x11, #0xfffffffffffffe00; \ - cmp xzr, xzr; \ - adcs xzr, x3, x12; \ - adcs xzr, x15, xzr; \ - adcs xzr, x11, xzr; \ - adcs x3, x3, x12; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adcs x6, x6, xzr; \ - adcs x7, x7, xzr; \ - adcs x8, x8, xzr; \ - adcs x9, x9, xzr; \ - adcs x10, x10, xzr; \ - adc x11, x11, xzr; \ - and x11, x11, #0x1ff; \ - stp x3, x4, [P0]; \ - stp x5, x6, [P0+16]; \ - stp x7, x8, [P0+32]; \ - stp x9, x10, [P0+48]; \ - str x11, [P0+64] - -// P0 = 4 * P1 - P2 = 4 * P1 + (p_521 - P2) - -#define cmsub41_p521(P0,P1,P2) \ - ldp x6, x7, [P1]; \ - lsl x3, x6, #2; \ - extr x4, x7, x6, #62; \ - ldp x8, x9, [P1+16]; \ - extr x5, x8, x7, #62; \ - extr x6, x9, x8, #62; \ - ldp x10, x11, [P1+32]; \ - extr x7, x10, x9, #62; \ - extr x8, x11, x10, #62; \ - ldp x12, x13, [P1+48]; \ - extr x9, x12, x11, #62; \ - extr x10, x13, x12, #62; \ - ldr x14, [P1+64]; \ - extr x11, x14, x13, #62; \ - ldp x0, x1, [P2]; \ - mvn x0, x0; \ - adds x3, x3, x0; \ - sbcs x4, x4, x1; \ - ldp x0, x1, [P2+16]; \ - sbcs x5, x5, x0; \ - and x15, x4, x5; \ - sbcs x6, x6, x1; \ - and x15, x15, x6; \ - ldp x0, x1, [P2+32]; \ - sbcs x7, x7, x0; \ - and x15, x15, x7; \ - sbcs x8, x8, x1; \ - and x15, x15, x8; \ - ldp x0, x1, [P2+48]; \ - sbcs x9, x9, x0; \ - and x15, x15, x9; \ - sbcs x10, x10, x1; \ - and x15, x15, x10; \ - ldr x0, [P2+64]; \ - eor x0, x0, #0x1ff; \ - adc x11, x11, x0; \ - lsr x12, x11, #9; \ - orr x11, x11, #0xfffffffffffffe00; \ - cmp xzr, xzr; \ - adcs xzr, x3, x12; \ - adcs xzr, x15, xzr; \ - adcs xzr, x11, xzr; \ - adcs x3, x3, x12; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adcs x6, x6, xzr; \ - adcs x7, x7, xzr; \ - adcs x8, x8, xzr; \ - adcs x9, x9, xzr; \ - adcs x10, x10, xzr; \ - adc x11, x11, xzr; \ - and x11, x11, #0x1ff; \ - stp x3, x4, [P0]; \ - stp x5, x6, [P0+16]; \ - stp x7, x8, [P0+32]; \ - stp x9, x10, [P0+48]; \ - str x11, [P0+64] - -S2N_BN_SYMBOL(p521_jdouble_alt): - -// Save regs and make room on stack for temporary variables - - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! - stp x23, x24, [sp, #-16]! - stp x25, x26, [sp, #-16]! - stp x27, x28, [sp, #-16]! 
- sub sp, sp, NSPACE - -// Move the input arguments to stable places - - mov input_z, x0 - mov input_x, x1 - -// Main code, just a sequence of basic field operations - -// z2 = z^2 -// y2 = y^2 - - sqr_p521(z2,z_1) - sqr_p521(y2,y_1) - -// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) - - add_p521(t1,x_1,z2) - sub_p521(t2,x_1,z2) - mul_p521(x2p,t1,t2) - -// t1 = y + z -// x4p = x2p^2 -// xy2 = x * y^2 - - add_p521(t1,y_1,z_1) - sqr_p521(x4p,x2p) - weakmul_p521(xy2,x_1,y2) - -// t2 = (y + z)^2 - - sqr_p521(t2,t1) - -// d = 12 * xy2 - 9 * x4p -// t1 = y^2 + 2 * y * z - - cmsub_p521(d,12,xy2,9,x4p) - sub_p521(t1,t2,z2) - -// y4 = y^4 - - sqr_p521(y4,y2) - -// z_3' = 2 * y * z -// dx2 = d * x2p - - sub_p521(z_3,t1,y2) - weakmul_p521(dx2,d,x2p) - -// x' = 4 * xy2 - d - - cmsub41_p521(x_3,xy2,d) - -// y' = 3 * dx2 - 8 * y4 - - cmsub38_p521(y_3,dx2,y4) - -// Restore stack and registers - - add sp, sp, NSPACE - - ldp x27, x28, [sp], 16 - ldp x25, x26, [sp], 16 - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits -#endif diff --git a/third_party/s2n-bignum/arm/p521/p521_jmixadd_alt.S b/third_party/s2n-bignum/arm/p521/p521_jmixadd_alt.S deleted file mode 100644 index 783ca28cf87..00000000000 --- a/third_party/s2n-bignum/arm/p521/p521_jmixadd_alt.S +++ /dev/null @@ -1,882 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Point mixed addition on NIST curve P-521 in Jacobian coordinates -// -// extern void p521_jmixadd_alt -// (uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 18]); -// -// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. -// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). -// The "mixed" part means that p2 only has x and y coordinates, with the -// implicit z coordinate assumed to be the identity. It is assumed that -// all the coordinates of the input points p1 and p2 are fully reduced -// mod p_521, that the z coordinate of p1 is nonzero and that neither -// p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents the same affine -// point as". 
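
Editorial note, not part of the diff: in the mixed case the second point carries no z coordinate, i.e. it is treated as having z_2 = 1, so (judging from the temporaries defined below and the analogous full-addition routine p521_jadd_alt above) the general Jacobian addition simplifies: only z_1^2 and z_1^3 are needed to bring p2 onto p1's scale (zp2 = z_1^2, x2a = zp2 * x_2, y2a = z_1^3 * y_2), the differences become xd = x2a - x_1 and yd = y2a - y_1, and the output z coordinate is z_1 * xd rather than z_1 * z_2 * xd. This is also why the routine gets by with one fewer NUMSIZE temporary than p521_jadd_alt: NSPACE is NUMSIZE*6 here, which is already 16-aligned, versus NUMSIZE*7+8 above.
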
-// -// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jmixadd_alt) - S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jmixadd_alt) - - .text - .balign 4 - -// Size of individual field elements - -#define NUMSIZE 72 - -// Stable homes for input arguments during main code sequence - -#define input_z x26 -#define input_x x27 -#define input_y x28 - -// Pointer-offset pairs for inputs and outputs - -#define x_1 input_x, #0 -#define y_1 input_x, #NUMSIZE -#define z_1 input_x, #(2*NUMSIZE) - -#define x_2 input_y, #0 -#define y_2 input_y, #NUMSIZE - -#define x_3 input_z, #0 -#define y_3 input_z, #NUMSIZE -#define z_3 input_z, #(2*NUMSIZE) - -// Pointer-offset pairs for temporaries, with some aliasing -// NSPACE is the total stack needed for these temporaries - -#define zp2 sp, #(NUMSIZE*0) -#define ww sp, #(NUMSIZE*0) -#define resx sp, #(NUMSIZE*0) - -#define yd sp, #(NUMSIZE*1) -#define y2a sp, #(NUMSIZE*1) - -#define x2a sp, #(NUMSIZE*2) -#define zzx2 sp, #(NUMSIZE*2) - -#define zz sp, #(NUMSIZE*3) -#define t1 sp, #(NUMSIZE*3) - -#define t2 sp, #(NUMSIZE*4) -#define zzx1 sp, #(NUMSIZE*4) -#define resy sp, #(NUMSIZE*4) - -#define xd sp, #(NUMSIZE*5) -#define resz sp, #(NUMSIZE*5) - -#define NSPACE (NUMSIZE*6) - -// Corresponds exactly to bignum_mul_p521_alt - -#define mul_p521(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x15, x3, x5; \ - umulh x16, x3, x5; \ - mul x14, x3, x6; \ - umulh x17, x3, x6; \ - adds x16, x16, x14; \ - ldp x7, x8, [P2+16]; \ - mul x14, x3, x7; \ - umulh x19, x3, x7; \ - adcs x17, x17, x14; \ - mul x14, x3, x8; \ - umulh x20, x3, x8; \ - adcs x19, x19, x14; \ - ldp x9, x10, [P2+32]; \ - mul x14, x3, x9; \ - umulh x21, x3, x9; \ - adcs x20, x20, x14; \ - mul x14, x3, x10; \ - umulh x22, x3, x10; \ - adcs x21, x21, x14; \ - ldp x11, x12, [P2+48]; \ - mul x14, x3, x11; \ - umulh x23, x3, x11; \ - adcs x22, x22, x14; \ - ldr x13, [P2+64]; \ - mul x14, x3, x12; \ - umulh x24, x3, x12; \ - adcs x23, x23, x14; \ - mul x14, x3, x13; \ - umulh x1, x3, x13; \ - adcs x24, x24, x14; \ - adc x1, x1, xzr; \ - mul x14, x4, x5; \ - adds x16, x16, x14; \ - mul x14, x4, x6; \ - adcs x17, x17, x14; \ - mul x14, x4, x7; \ - adcs x19, x19, x14; \ - mul x14, x4, x8; \ - adcs x20, x20, x14; \ - mul x14, x4, x9; \ - adcs x21, x21, x14; \ - mul x14, x4, x10; \ - adcs x22, x22, x14; \ - mul x14, x4, x11; \ - adcs x23, x23, x14; \ - mul x14, x4, x12; \ - adcs x24, x24, x14; \ - mul x14, x4, x13; \ - adcs x1, x1, x14; \ - cset x0, hs; \ - umulh x14, x4, x5; \ - adds x17, x17, x14; \ - umulh x14, x4, x6; \ - adcs x19, x19, x14; \ - umulh x14, x4, x7; \ - adcs x20, x20, x14; \ - umulh x14, x4, x8; \ - adcs x21, x21, x14; \ - umulh x14, x4, x9; \ - adcs x22, x22, x14; \ - umulh x14, x4, x10; \ - adcs x23, x23, x14; \ - umulh x14, x4, x11; \ - adcs x24, x24, x14; \ - umulh x14, x4, x12; \ - adcs x1, x1, x14; \ - umulh x14, x4, x13; \ - adc x0, x0, x14; \ - stp x15, x16, [P0]; \ - ldp x3, x4, [P1+16]; \ - mul x14, x3, x5; \ - adds x17, x17, x14; \ - mul x14, x3, x6; \ - adcs x19, x19, x14; \ - mul x14, x3, x7; \ - adcs x20, x20, x14; \ - mul x14, x3, x8; \ - adcs x21, x21, x14; \ - mul x14, x3, x9; \ - adcs x22, x22, x14; \ - mul x14, x3, x10; \ - adcs x23, x23, x14; \ - mul x14, x3, x11; \ - adcs x24, x24, x14; \ - mul x14, x3, x12; \ - adcs x1, x1, x14; \ - mul x14, x3, x13; \ - adcs x0, x0, x14; \ - cset x15, hs; \ - umulh x14, x3, x5; \ - adds 
x19, x19, x14; \ - umulh x14, x3, x6; \ - adcs x20, x20, x14; \ - umulh x14, x3, x7; \ - adcs x21, x21, x14; \ - umulh x14, x3, x8; \ - adcs x22, x22, x14; \ - umulh x14, x3, x9; \ - adcs x23, x23, x14; \ - umulh x14, x3, x10; \ - adcs x24, x24, x14; \ - umulh x14, x3, x11; \ - adcs x1, x1, x14; \ - umulh x14, x3, x12; \ - adcs x0, x0, x14; \ - umulh x14, x3, x13; \ - adc x15, x15, x14; \ - mul x14, x4, x5; \ - adds x19, x19, x14; \ - mul x14, x4, x6; \ - adcs x20, x20, x14; \ - mul x14, x4, x7; \ - adcs x21, x21, x14; \ - mul x14, x4, x8; \ - adcs x22, x22, x14; \ - mul x14, x4, x9; \ - adcs x23, x23, x14; \ - mul x14, x4, x10; \ - adcs x24, x24, x14; \ - mul x14, x4, x11; \ - adcs x1, x1, x14; \ - mul x14, x4, x12; \ - adcs x0, x0, x14; \ - mul x14, x4, x13; \ - adcs x15, x15, x14; \ - cset x16, hs; \ - umulh x14, x4, x5; \ - adds x20, x20, x14; \ - umulh x14, x4, x6; \ - adcs x21, x21, x14; \ - umulh x14, x4, x7; \ - adcs x22, x22, x14; \ - umulh x14, x4, x8; \ - adcs x23, x23, x14; \ - umulh x14, x4, x9; \ - adcs x24, x24, x14; \ - umulh x14, x4, x10; \ - adcs x1, x1, x14; \ - umulh x14, x4, x11; \ - adcs x0, x0, x14; \ - umulh x14, x4, x12; \ - adcs x15, x15, x14; \ - umulh x14, x4, x13; \ - adc x16, x16, x14; \ - stp x17, x19, [P0+16]; \ - ldp x3, x4, [P1+32]; \ - mul x14, x3, x5; \ - adds x20, x20, x14; \ - mul x14, x3, x6; \ - adcs x21, x21, x14; \ - mul x14, x3, x7; \ - adcs x22, x22, x14; \ - mul x14, x3, x8; \ - adcs x23, x23, x14; \ - mul x14, x3, x9; \ - adcs x24, x24, x14; \ - mul x14, x3, x10; \ - adcs x1, x1, x14; \ - mul x14, x3, x11; \ - adcs x0, x0, x14; \ - mul x14, x3, x12; \ - adcs x15, x15, x14; \ - mul x14, x3, x13; \ - adcs x16, x16, x14; \ - cset x17, hs; \ - umulh x14, x3, x5; \ - adds x21, x21, x14; \ - umulh x14, x3, x6; \ - adcs x22, x22, x14; \ - umulh x14, x3, x7; \ - adcs x23, x23, x14; \ - umulh x14, x3, x8; \ - adcs x24, x24, x14; \ - umulh x14, x3, x9; \ - adcs x1, x1, x14; \ - umulh x14, x3, x10; \ - adcs x0, x0, x14; \ - umulh x14, x3, x11; \ - adcs x15, x15, x14; \ - umulh x14, x3, x12; \ - adcs x16, x16, x14; \ - umulh x14, x3, x13; \ - adc x17, x17, x14; \ - mul x14, x4, x5; \ - adds x21, x21, x14; \ - mul x14, x4, x6; \ - adcs x22, x22, x14; \ - mul x14, x4, x7; \ - adcs x23, x23, x14; \ - mul x14, x4, x8; \ - adcs x24, x24, x14; \ - mul x14, x4, x9; \ - adcs x1, x1, x14; \ - mul x14, x4, x10; \ - adcs x0, x0, x14; \ - mul x14, x4, x11; \ - adcs x15, x15, x14; \ - mul x14, x4, x12; \ - adcs x16, x16, x14; \ - mul x14, x4, x13; \ - adcs x17, x17, x14; \ - cset x19, hs; \ - umulh x14, x4, x5; \ - adds x22, x22, x14; \ - umulh x14, x4, x6; \ - adcs x23, x23, x14; \ - umulh x14, x4, x7; \ - adcs x24, x24, x14; \ - umulh x14, x4, x8; \ - adcs x1, x1, x14; \ - umulh x14, x4, x9; \ - adcs x0, x0, x14; \ - umulh x14, x4, x10; \ - adcs x15, x15, x14; \ - umulh x14, x4, x11; \ - adcs x16, x16, x14; \ - umulh x14, x4, x12; \ - adcs x17, x17, x14; \ - umulh x14, x4, x13; \ - adc x19, x19, x14; \ - stp x20, x21, [P0+32]; \ - ldp x3, x4, [P1+48]; \ - mul x14, x3, x5; \ - adds x22, x22, x14; \ - mul x14, x3, x6; \ - adcs x23, x23, x14; \ - mul x14, x3, x7; \ - adcs x24, x24, x14; \ - mul x14, x3, x8; \ - adcs x1, x1, x14; \ - mul x14, x3, x9; \ - adcs x0, x0, x14; \ - mul x14, x3, x10; \ - adcs x15, x15, x14; \ - mul x14, x3, x11; \ - adcs x16, x16, x14; \ - mul x14, x3, x12; \ - adcs x17, x17, x14; \ - mul x14, x3, x13; \ - adcs x19, x19, x14; \ - cset x20, hs; \ - umulh x14, x3, x5; \ - adds x23, x23, x14; \ - umulh x14, x3, x6; \ - adcs x24, x24, x14; \ - 
umulh x14, x3, x7; \ - adcs x1, x1, x14; \ - umulh x14, x3, x8; \ - adcs x0, x0, x14; \ - umulh x14, x3, x9; \ - adcs x15, x15, x14; \ - umulh x14, x3, x10; \ - adcs x16, x16, x14; \ - umulh x14, x3, x11; \ - adcs x17, x17, x14; \ - umulh x14, x3, x12; \ - adcs x19, x19, x14; \ - umulh x14, x3, x13; \ - adc x20, x20, x14; \ - mul x14, x4, x5; \ - adds x23, x23, x14; \ - mul x14, x4, x6; \ - adcs x24, x24, x14; \ - mul x14, x4, x7; \ - adcs x1, x1, x14; \ - mul x14, x4, x8; \ - adcs x0, x0, x14; \ - mul x14, x4, x9; \ - adcs x15, x15, x14; \ - mul x14, x4, x10; \ - adcs x16, x16, x14; \ - mul x14, x4, x11; \ - adcs x17, x17, x14; \ - mul x14, x4, x12; \ - adcs x19, x19, x14; \ - mul x14, x4, x13; \ - adcs x20, x20, x14; \ - cset x21, hs; \ - umulh x14, x4, x5; \ - adds x24, x24, x14; \ - umulh x14, x4, x6; \ - adcs x1, x1, x14; \ - umulh x14, x4, x7; \ - adcs x0, x0, x14; \ - umulh x14, x4, x8; \ - adcs x15, x15, x14; \ - umulh x14, x4, x9; \ - adcs x16, x16, x14; \ - umulh x14, x4, x10; \ - adcs x17, x17, x14; \ - umulh x14, x4, x11; \ - adcs x19, x19, x14; \ - umulh x14, x4, x12; \ - adcs x20, x20, x14; \ - umulh x14, x4, x13; \ - adc x21, x21, x14; \ - stp x22, x23, [P0+48]; \ - ldr x3, [P1+64]; \ - mul x14, x3, x5; \ - adds x24, x24, x14; \ - mul x14, x3, x6; \ - adcs x1, x1, x14; \ - mul x14, x3, x7; \ - adcs x0, x0, x14; \ - mul x14, x3, x8; \ - adcs x15, x15, x14; \ - mul x14, x3, x9; \ - adcs x16, x16, x14; \ - mul x14, x3, x10; \ - adcs x17, x17, x14; \ - mul x14, x3, x11; \ - adcs x19, x19, x14; \ - mul x14, x3, x12; \ - adcs x20, x20, x14; \ - mul x14, x3, x13; \ - adc x21, x21, x14; \ - umulh x14, x3, x5; \ - adds x1, x1, x14; \ - umulh x14, x3, x6; \ - adcs x0, x0, x14; \ - umulh x14, x3, x7; \ - adcs x15, x15, x14; \ - umulh x14, x3, x8; \ - adcs x16, x16, x14; \ - umulh x14, x3, x9; \ - adcs x17, x17, x14; \ - umulh x14, x3, x10; \ - adcs x19, x19, x14; \ - umulh x14, x3, x11; \ - adcs x20, x20, x14; \ - umulh x14, x3, x12; \ - adc x21, x21, x14; \ - cmp xzr, xzr; \ - ldp x5, x6, [P0]; \ - extr x14, x1, x24, #9; \ - adcs x5, x5, x14; \ - extr x14, x0, x1, #9; \ - adcs x6, x6, x14; \ - ldp x7, x8, [P0+16]; \ - extr x14, x15, x0, #9; \ - adcs x7, x7, x14; \ - extr x14, x16, x15, #9; \ - adcs x8, x8, x14; \ - ldp x9, x10, [P0+32]; \ - extr x14, x17, x16, #9; \ - adcs x9, x9, x14; \ - extr x14, x19, x17, #9; \ - adcs x10, x10, x14; \ - ldp x11, x12, [P0+48]; \ - extr x14, x20, x19, #9; \ - adcs x11, x11, x14; \ - extr x14, x21, x20, #9; \ - adcs x12, x12, x14; \ - orr x13, x24, #0xfffffffffffffe00; \ - lsr x14, x21, #9; \ - adcs x13, x13, x14; \ - sbcs x5, x5, xzr; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - and x13, x13, #0x1ff; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] - -// Corresponds exactly to bignum_sqr_p521_alt - -#define sqr_p521(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x11, x2, x3; \ - umulh x12, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x10, x2, x4; \ - umulh x13, x2, x4; \ - adds x12, x12, x10; \ - ldp x6, x7, [P1+32]; \ - mul x10, x2, x5; \ - umulh x14, x2, x5; \ - adcs x13, x13, x10; \ - ldp x8, x9, [P1+48]; \ - mul x10, x2, x6; \ - umulh x15, x2, x6; \ - adcs x14, x14, x10; \ - mul x10, x2, x7; \ - umulh x16, x2, x7; \ - adcs x15, x15, x10; \ - mul x10, x2, x8; \ - umulh x17, x2, x8; \ - adcs x16, x16, x10; \ - mul x10, x2, x9; \ - umulh x19, x2, x9; \ 
- adcs x17, x17, x10; \ - adc x19, x19, xzr; \ - mul x10, x3, x4; \ - adds x13, x13, x10; \ - mul x10, x3, x5; \ - adcs x14, x14, x10; \ - mul x10, x3, x6; \ - adcs x15, x15, x10; \ - mul x10, x3, x7; \ - adcs x16, x16, x10; \ - mul x10, x3, x8; \ - adcs x17, x17, x10; \ - mul x10, x3, x9; \ - adcs x19, x19, x10; \ - cset x20, hs; \ - umulh x10, x3, x4; \ - adds x14, x14, x10; \ - umulh x10, x3, x5; \ - adcs x15, x15, x10; \ - umulh x10, x3, x6; \ - adcs x16, x16, x10; \ - umulh x10, x3, x7; \ - adcs x17, x17, x10; \ - umulh x10, x3, x8; \ - adcs x19, x19, x10; \ - umulh x10, x3, x9; \ - adc x20, x20, x10; \ - mul x10, x6, x7; \ - umulh x21, x6, x7; \ - adds x20, x20, x10; \ - adc x21, x21, xzr; \ - mul x10, x4, x5; \ - adds x15, x15, x10; \ - mul x10, x4, x6; \ - adcs x16, x16, x10; \ - mul x10, x4, x7; \ - adcs x17, x17, x10; \ - mul x10, x4, x8; \ - adcs x19, x19, x10; \ - mul x10, x4, x9; \ - adcs x20, x20, x10; \ - mul x10, x6, x8; \ - adcs x21, x21, x10; \ - cset x22, hs; \ - umulh x10, x4, x5; \ - adds x16, x16, x10; \ - umulh x10, x4, x6; \ - adcs x17, x17, x10; \ - umulh x10, x4, x7; \ - adcs x19, x19, x10; \ - umulh x10, x4, x8; \ - adcs x20, x20, x10; \ - umulh x10, x4, x9; \ - adcs x21, x21, x10; \ - umulh x10, x6, x8; \ - adc x22, x22, x10; \ - mul x10, x7, x8; \ - umulh x23, x7, x8; \ - adds x22, x22, x10; \ - adc x23, x23, xzr; \ - mul x10, x5, x6; \ - adds x17, x17, x10; \ - mul x10, x5, x7; \ - adcs x19, x19, x10; \ - mul x10, x5, x8; \ - adcs x20, x20, x10; \ - mul x10, x5, x9; \ - adcs x21, x21, x10; \ - mul x10, x6, x9; \ - adcs x22, x22, x10; \ - mul x10, x7, x9; \ - adcs x23, x23, x10; \ - cset x24, hs; \ - umulh x10, x5, x6; \ - adds x19, x19, x10; \ - umulh x10, x5, x7; \ - adcs x20, x20, x10; \ - umulh x10, x5, x8; \ - adcs x21, x21, x10; \ - umulh x10, x5, x9; \ - adcs x22, x22, x10; \ - umulh x10, x6, x9; \ - adcs x23, x23, x10; \ - umulh x10, x7, x9; \ - adc x24, x24, x10; \ - mul x10, x8, x9; \ - umulh x25, x8, x9; \ - adds x24, x24, x10; \ - adc x25, x25, xzr; \ - adds x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - adcs x20, x20, x20; \ - adcs x21, x21, x21; \ - adcs x22, x22, x22; \ - adcs x23, x23, x23; \ - adcs x24, x24, x24; \ - adcs x25, x25, x25; \ - cset x0, hs; \ - umulh x10, x2, x2; \ - adds x11, x11, x10; \ - mul x10, x3, x3; \ - adcs x12, x12, x10; \ - umulh x10, x3, x3; \ - adcs x13, x13, x10; \ - mul x10, x4, x4; \ - adcs x14, x14, x10; \ - umulh x10, x4, x4; \ - adcs x15, x15, x10; \ - mul x10, x5, x5; \ - adcs x16, x16, x10; \ - umulh x10, x5, x5; \ - adcs x17, x17, x10; \ - mul x10, x6, x6; \ - adcs x19, x19, x10; \ - umulh x10, x6, x6; \ - adcs x20, x20, x10; \ - mul x10, x7, x7; \ - adcs x21, x21, x10; \ - umulh x10, x7, x7; \ - adcs x22, x22, x10; \ - mul x10, x8, x8; \ - adcs x23, x23, x10; \ - umulh x10, x8, x8; \ - adcs x24, x24, x10; \ - mul x10, x9, x9; \ - adcs x25, x25, x10; \ - umulh x10, x9, x9; \ - adc x0, x0, x10; \ - ldr x1, [P1+64]; \ - add x1, x1, x1; \ - mul x10, x1, x2; \ - adds x19, x19, x10; \ - umulh x10, x1, x2; \ - adcs x20, x20, x10; \ - mul x10, x1, x4; \ - adcs x21, x21, x10; \ - umulh x10, x1, x4; \ - adcs x22, x22, x10; \ - mul x10, x1, x6; \ - adcs x23, x23, x10; \ - umulh x10, x1, x6; \ - adcs x24, x24, x10; \ - mul x10, x1, x8; \ - adcs x25, x25, x10; \ - umulh x10, x1, x8; \ - adcs x0, x0, x10; \ - lsr x4, x1, #1; \ - mul x4, x4, x4; \ - adc x4, x4, xzr; \ - mul x10, x1, x3; 
\ - adds x20, x20, x10; \ - umulh x10, x1, x3; \ - adcs x21, x21, x10; \ - mul x10, x1, x5; \ - adcs x22, x22, x10; \ - umulh x10, x1, x5; \ - adcs x23, x23, x10; \ - mul x10, x1, x7; \ - adcs x24, x24, x10; \ - umulh x10, x1, x7; \ - adcs x25, x25, x10; \ - mul x10, x1, x9; \ - adcs x0, x0, x10; \ - umulh x10, x1, x9; \ - adc x4, x4, x10; \ - mul x2, x2, x2; \ - cmp xzr, xzr; \ - extr x10, x20, x19, #9; \ - adcs x2, x2, x10; \ - extr x10, x21, x20, #9; \ - adcs x11, x11, x10; \ - extr x10, x22, x21, #9; \ - adcs x12, x12, x10; \ - extr x10, x23, x22, #9; \ - adcs x13, x13, x10; \ - extr x10, x24, x23, #9; \ - adcs x14, x14, x10; \ - extr x10, x25, x24, #9; \ - adcs x15, x15, x10; \ - extr x10, x0, x25, #9; \ - adcs x16, x16, x10; \ - extr x10, x4, x0, #9; \ - adcs x17, x17, x10; \ - orr x19, x19, #0xfffffffffffffe00; \ - lsr x10, x4, #9; \ - adcs x19, x19, x10; \ - sbcs x2, x2, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbcs x15, x15, xzr; \ - sbcs x16, x16, xzr; \ - sbcs x17, x17, xzr; \ - sbc x19, x19, xzr; \ - and x19, x19, #0x1ff; \ - stp x2, x11, [P0]; \ - stp x12, x13, [P0+16]; \ - stp x14, x15, [P0+32]; \ - stp x16, x17, [P0+48]; \ - str x19, [P0+64] - -// Corresponds exactly to bignum_sub_p521 - -#define sub_p521(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - ldp x11, x12, [P1+48]; \ - ldp x4, x3, [P2+48]; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ - ldr x13, [P1+64]; \ - ldr x4, [P2+64]; \ - sbcs x13, x13, x4; \ - sbcs x5, x5, xzr; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - and x13, x13, #0x1ff; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] - -S2N_BN_SYMBOL(p521_jmixadd_alt): - -// Save regs and make room on stack for temporary variables - - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! - stp x23, x24, [sp, #-16]! - stp x25, x26, [sp, #-16]! - stp x27, x28, [sp, #-16]! - sub sp, sp, NSPACE - -// Move the input arguments to stable places - - mov input_z, x0 - mov input_x, x1 - mov input_y, x2 - -// Main code, just a sequence of basic field operations - - sqr_p521(zp2,z_1) - mul_p521(y2a,z_1,y_2) - - mul_p521(x2a,zp2,x_2) - mul_p521(y2a,zp2,y2a) - - sub_p521(xd,x2a,x_1) - sub_p521(yd,y2a,y_1) - - sqr_p521(zz,xd) - sqr_p521(ww,yd) - - mul_p521(zzx1,zz,x_1) - mul_p521(zzx2,zz,x2a) - - sub_p521(resx,ww,zzx1) - sub_p521(t1,zzx2,zzx1) - - mul_p521(resz,xd,z_1) - - sub_p521(resx,resx,zzx2) - - sub_p521(t2,zzx1,resx) - - mul_p521(t1,t1,y_1) - mul_p521(t2,yd,t2) - - sub_p521(resy,t2,t1) - -// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) - - ldp x0, x1, [z_1] - orr x0, x0, x1 - ldp x2, x3, [z_1+16] - orr x2, x2, x3 - ldp x4, x5, [z_1+32] - orr x4, x4, x5 - ldp x6, x7, [z_1+48] - orr x6, x6, x7 - ldr x8, [z_1+64] - orr x0, x0, x2 - orr x4, x4, x6 - orr x0, x0, x4 - orr x0, x0, x8 - cmp x0, xzr - -// Multiplex: if p1 <> 0 just copy the computed result from the staging area. -// If p1 = 0 then return the point p2 augmented with an extra z = 1 -// coordinate, hence giving 0 + p2 = p2 for the final result. 
- - ldp x0, x1, [resx] - ldp x20, x21, [x_2] - csel x0, x0, x20, ne - csel x1, x1, x21, ne - ldp x2, x3, [resx+16] - ldp x20, x21, [x_2+16] - csel x2, x2, x20, ne - csel x3, x3, x21, ne - ldp x4, x5, [resx+32] - ldp x20, x21, [x_2+32] - csel x4, x4, x20, ne - csel x5, x5, x21, ne - ldp x6, x7, [resx+48] - ldp x20, x21, [x_2+48] - csel x6, x6, x20, ne - csel x7, x7, x21, ne - ldr x8, [resx+64] - ldr x20, [x_2+64] - csel x8, x8, x20, ne - - ldp x10, x11, [resy] - ldp x20, x21, [y_2] - csel x10, x10, x20, ne - csel x11, x11, x21, ne - ldp x12, x13, [resy+16] - ldp x20, x21, [y_2+16] - csel x12, x12, x20, ne - csel x13, x13, x21, ne - ldp x14, x15, [resy+32] - ldp x20, x21, [y_2+32] - csel x14, x14, x20, ne - csel x15, x15, x21, ne - ldp x16, x17, [resy+48] - ldp x20, x21, [y_2+48] - csel x16, x16, x20, ne - csel x17, x17, x21, ne - ldr x19, [resy+64] - ldr x20, [y_2+64] - csel x19, x19, x20, ne - - stp x0, x1, [x_3] - stp x2, x3, [x_3+16] - stp x4, x5, [x_3+32] - stp x6, x7, [x_3+48] - str x8, [x_3+64] - stp x10, x11, [y_3] - stp x12, x13, [y_3+16] - stp x14, x15, [y_3+32] - stp x16, x17, [y_3+48] - str x19, [y_3+64] - - ldp x0, x1, [resz] - mov x20, #1 - csel x0, x0, x20, ne - csel x1, x1, xzr, ne - ldp x2, x3, [resz+16] - csel x2, x2, xzr, ne - csel x3, x3, xzr, ne - ldp x4, x5, [resz+32] - csel x4, x4, xzr, ne - csel x5, x5, xzr, ne - ldp x6, x7, [resz+48] - csel x6, x6, xzr, ne - csel x7, x7, xzr, ne - ldr x8, [resz+64] - csel x8, x8, xzr, ne - - stp x0, x1, [z_3] - stp x2, x3, [z_3+16] - stp x4, x5, [z_3+32] - stp x6, x7, [z_3+48] - str x8, [z_3+64] - -// Restore stack and registers - - add sp, sp, NSPACE - - ldp x27, x28, [sp], 16 - ldp x25, x26, [sp], 16 - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits -#endif diff --git a/third_party/s2n-bignum/import.sh b/third_party/s2n-bignum/import.sh new file mode 100755 index 00000000000..4a2bb1b638d --- /dev/null +++ b/third_party/s2n-bignum/import.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 OR ISC + +# https://github.com/awslabs/s2n-bignum -> AWS-LC importer script +# +# This script imports a version of s2n-bignum source into AWS-LC. +# +# Usage: +# +# ``` +# rm -rf ./s2n-bignum-imported +# ./import.sh +# ``` +# +# This imports s2n-bignum from https://github.com/awslabs/s2n-bignum +# and leaves import meta data in META.yml. +# +# If you want to import a specific branch/tag or from a specific repository +# either set GITHUB_TARGET or GITHUB_REPOSITORY as below: +# +# ``` +# GITHUB_REPOSITORY=/ GITHUB_TARGET= ./import.sh +# ``` + +GITHUB_SERVER_URL="https://github.com/" +GITHUB_REPOSITORY=${GITHUB_REPOSITORY:=awslabs/s2n-bignum.git} +GITHUB_TARGET=${GITHUB_TARGET:=main} + +SRC="s2n-bignum-imported" +TMP="TEMP_CAN_DELETE" + +# Check if TMP directory already exists +if [ -d "${TMP}" ]; then + echo "Source directory or symlink ${TMP} does already exist -- please remove it before re-running the importer" + exit 1 +fi + +# Check if source directory already exists +if [ -d "${SRC}" ]; then + echo "Source directory or symlink ${SRC} does already exist -- please remove it before re-running the importer" + exit 1 +fi + +mkdir ${TMP} + +echo "Fetching repository ..." 
+git clone ${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY} ${TMP} --branch ${GITHUB_TARGET} --single-branch > /dev/null +GITHUB_COMMIT=$(cd ${TMP} >/dev/null; git rev-parse HEAD) + +echo "Cloned s2n-bignum folder" +ls -la ${TMP} + +echo "Remove source code from s2n-bignum that is not needed..." +code_not_needed=("benchmarks" "codebuild" "common" "tests" "tools" "x86" "arm/proofs") +for code in "${code_not_needed[@]}"; do + rm -rf ${TMP}/${code} +done + +echo "Cloned s2n-bignum folder after removing unneeded source code..." +ls -la ${TMP} + +echo "Copy source code ..." +mkdir ${SRC} +cp -rH ${TMP}/* ${SRC} + +echo "Copied s2n-bignum source code..." +ls -la ${SRC} + +echo "Remove temporary artifacts ..." +rm -rf ${TMP} + +echo "Generating META.yml file ..." +cat <<EOF > META.yml +name: ${SRC} +source: ${GITHUB_REPOSITORY} +commit: ${GITHUB_COMMIT} +target: ${GITHUB_TARGET} +imported-at: $(env TZ=UTC date "+%Y-%m-%dT%H:%M:%S%z") +EOF + +# Submodule path might be cached. +echo "" +echo "Post actions: Run" +echo "$ git add ${SRC} META.yml ; git commit -m \"Imported s2n-bignum version: ${GITHUB_TARGET}/${GITHUB_COMMIT}\"" +echo "to add new source to git tree" diff --git a/third_party/s2n-bignum/include/_internal_s2n_bignum.h b/third_party/s2n-bignum/include/_internal_s2n_bignum.h deleted file mode 100644 index c7cedb633a4..00000000000 --- a/third_party/s2n-bignum/include/_internal_s2n_bignum.h +++ /dev/null @@ -1,17 +0,0 @@ - -#ifdef __APPLE__ -# define S2N_BN_SYMBOL(NAME) _##NAME -#else -# define S2N_BN_SYMBOL(name) name -#endif - -#define S2N_BN_SYM_VISIBILITY_DIRECTIVE(name) .globl S2N_BN_SYMBOL(name) -#ifdef S2N_BN_HIDE_SYMBOLS -# ifdef __APPLE__ -# define S2N_BN_SYM_PRIVACY_DIRECTIVE(name) .private_extern S2N_BN_SYMBOL(name) -# else -# define S2N_BN_SYM_PRIVACY_DIRECTIVE(name) .hidden S2N_BN_SYMBOL(name) -# endif -#else -# define S2N_BN_SYM_PRIVACY_DIRECTIVE(name) /* NO-OP: S2N_BN_SYM_PRIVACY_DIRECTIVE */ -#endif \ No newline at end of file diff --git a/third_party/s2n-bignum/include/s2n-bignum_aws-lc.h b/third_party/s2n-bignum/include/s2n-bignum_aws-lc.h deleted file mode 100644 index 186029bf08f..00000000000 --- a/third_party/s2n-bignum/include/s2n-bignum_aws-lc.h +++ /dev/null @@ -1,433 +0,0 @@ -/* - * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"). - * You may not use this file except in compliance with the License. - * A copy of the License is located at - * - * http://aws.amazon.com/apache2.0 - * - * or in the "LICENSE" file accompanying this file. This file is distributed - * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either - * express or implied. See the License for the specific language governing - * permissions and limitations under the License. - */ -#ifndef S2N_BIGNUM_AWS_LC_H -#define S2N_BIGNUM_AWS_LC_H - -#if defined(_MSC_VER) || !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L || defined(__STDC_NO_VLA__) - #define S2N_BIGNUM_STATIC -#else - #define S2N_BIGNUM_STATIC static -#endif - -// ---------------------------------------------------------------------------- -// C prototypes for s2n-bignum functions used in AWS-LC -// ---------------------------------------------------------------------------- - -// For some functions there are additional variants with names ending in -// "_alt".
These have the same core mathematical functionality as their -// non-"alt" versions, but can be better suited to some microarchitectures: -// -// - On x86, the "_alt" forms avoid BMI and ADX instruction set -// extensions, so will run on any x86_64 machine, even older ones -// -// - On ARM, the "_alt" forms target machines with higher multiplier -// throughput, generally offering higher performance there. -// For each of those, we define a _selector function that selects, in runtime, -// the _alt or non-_alt version to run. - -#if defined(OPENSSL_X86_64) -// On x86_64 platforms s2n-bignum uses bmi2 and adx instruction sets -// for some of the functions. These instructions are not supported by -// every x86 CPU so we have to check if they are available and in case -// they are not we fallback to slightly slower but generic implementation. -static inline uint8_t use_s2n_bignum_alt(void) { - return (!CRYPTO_is_BMI2_capable() || !CRYPTO_is_ADX_capable()); -} -#else -// On aarch64 platforms s2n-bignum has two implementations of certain -// functions -- the default one and the alternative (suffixed _alt). -// Depending on the architecture one version is faster than the other. -// Generally, the "_alt" functions are faster on architectures with higher -// multiplier throughput, for example, Graviton 3, Apple's M1 and iPhone chips. -static inline uint8_t use_s2n_bignum_alt(void) { - return CRYPTO_is_ARMv8_wide_multiplier_capable(); -} -#endif - -extern void p256_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 12], const uint64_t scalar[S2N_BIGNUM_STATIC 4], uint64_t point[S2N_BIGNUM_STATIC 12]); -extern void p256_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 12], const uint64_t scalar[S2N_BIGNUM_STATIC 4], uint64_t point[S2N_BIGNUM_STATIC 12]); -static inline void p256_montjscalarmul_selector(uint64_t res[S2N_BIGNUM_STATIC 12], const uint64_t scalar[S2N_BIGNUM_STATIC 4], uint64_t point[S2N_BIGNUM_STATIC 12]) { - if (use_s2n_bignum_alt()) { p256_montjscalarmul_alt(res, scalar, point); } - else { p256_montjscalarmul(res, scalar, point); } -} - -// Montgomery inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1 -// z = x^-1 mod p_256. -// The function is constant-time. 
-extern void bignum_montinv_p256(uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); - -// Add modulo p_384, z := (x + y) mod p_384, assuming x and y reduced -// Inputs x[6], y[6]; output z[6] -extern void bignum_add_p384(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]); - -// Convert from almost-Montgomery form, z := (x / 2^384) mod p_384 -// Input x[6]; output z[6] -extern void bignum_deamont_p384(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); -extern void bignum_deamont_p384_alt(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); -static inline void bignum_deamont_p384_selector(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]) { - if (use_s2n_bignum_alt()) { bignum_deamont_p384_alt(z, x); } - else { bignum_deamont_p384(z, x); } -} - -// Montgomery multiply, z := (x * y / 2^384) mod p_384 -// Inputs x[6], y[6]; output z[6] -extern void bignum_montmul_p384(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]); -extern void bignum_montmul_p384_alt(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]); -static inline void bignum_montmul_p384_selector(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]) { - if (use_s2n_bignum_alt()) { bignum_montmul_p384_alt(z, x, y); } - else { bignum_montmul_p384(z, x, y); } -} - -// Montgomery square, z := (x^2 / 2^384) mod p_384 -// Input x[6]; output z[6] -extern void bignum_montsqr_p384(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); -extern void bignum_montsqr_p384_alt(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); -static inline void bignum_montsqr_p384_selector(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]) { - if (use_s2n_bignum_alt()) { bignum_montsqr_p384_alt(z, x); } - else { bignum_montsqr_p384(z, x); } -} - -// Negate modulo p_384, z := (-x) mod p_384, assuming x reduced -// Input x[6]; output z[6] -extern void bignum_neg_p384(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); - -// Subtract modulo p_384, z := (x - y) mod p_384 -// Inputs x[6], y[6]; output z[6] -extern void bignum_sub_p384(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]); - -// Convert to Montgomery form z := (2^384 * x) mod p_384 */ -// Input x[6]; output z[6] */ -extern void bignum_tomont_p384(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); -extern void bignum_tomont_p384_alt(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); -static inline void bignum_tomont_p384_selector(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]) { - if (use_s2n_bignum_alt()) { bignum_tomont_p384_alt(z, x); } - else { bignum_tomont_p384(z, x); } -} -extern void p384_montjdouble(uint64_t p3[S2N_BIGNUM_STATIC 18],uint64_t p1[S2N_BIGNUM_STATIC 18]); -extern void p384_montjdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 18],uint64_t p1[S2N_BIGNUM_STATIC 18]); -static inline void p384_montjdouble_selector(uint64_t p3[S2N_BIGNUM_STATIC 18],uint64_t p1[S2N_BIGNUM_STATIC 18]) { - if (use_s2n_bignum_alt()) { p384_montjdouble_alt(p3, p1); } - else { p384_montjdouble(p3, p1); } -} - -extern void p384_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 18], const uint64_t 
scalar[S2N_BIGNUM_STATIC 6], uint64_t point[S2N_BIGNUM_STATIC 18]); -extern void p384_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 18], const uint64_t scalar[S2N_BIGNUM_STATIC 6], uint64_t point[S2N_BIGNUM_STATIC 18]); -static inline void p384_montjscalarmul_selector(uint64_t res[S2N_BIGNUM_STATIC 18], const uint64_t scalar[S2N_BIGNUM_STATIC 6], uint64_t point[S2N_BIGNUM_STATIC 18]) { - if (use_s2n_bignum_alt()) { p384_montjscalarmul_alt(res, scalar, point); } - else { p384_montjscalarmul(res, scalar, point); } -} - -// Montgomery inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1 -// z = x^-1 mod p_384. -// The function is constant-time. -extern void bignum_montinv_p384(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); - -// Convert 6-digit (384-bit) bignum from little-endian form -// Input x[6]; output z[6] -extern void bignum_fromlebytes_6(uint64_t z[S2N_BIGNUM_STATIC 6], const uint8_t x[S2N_BIGNUM_STATIC 48]); - -// Convert 6-digit (384-bit) bignum to little-endian form -// Input x[6]; output z[6] -extern void bignum_tolebytes_6(uint8_t z[S2N_BIGNUM_STATIC 48], const uint64_t x[S2N_BIGNUM_STATIC 6]); - -// 384-bit nonzeroness test, returning 1 if x is nonzero, 0 if x is zero -// Input x[6]; output function return -extern uint64_t bignum_nonzero_6(const uint64_t x[S2N_BIGNUM_STATIC 6]); - -// Add modulo p_521, z := (x + y) mod p_521, assuming x and y reduced -// Inputs x[9], y[9]; output z[9] -extern void bignum_add_p521(uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]); - -// Subtract modulo p_521, z := (x - y) mod p_521 -// Inputs x[9], y[9]; output z[9] -extern void bignum_sub_p521(uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]); - -// Negate modulo p_521, z := (-x) mod p_521, assuming x reduced -// Input x[9]; output z[9] -extern void bignum_neg_p521(uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); - -// Multiply modulo p_521, z := (x * y) mod p_521, assuming x and y reduced -// Inputs x[9], y[9]; output z[9] -extern void bignum_mul_p521(uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]); -extern void bignum_mul_p521_alt(uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]); -static inline void bignum_mul_p521_selector(uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]) { - if (use_s2n_bignum_alt()) { bignum_mul_p521_alt(z, x, y); } - else { bignum_mul_p521(z, x, y); } -} - -// Square modulo p_521, z := (x^2) mod p_521, assuming x reduced -// Input x[9]; output z[9] -extern void bignum_sqr_p521(uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); -extern void bignum_sqr_p521_alt(uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); -static inline void bignum_sqr_p521_selector(uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]) { - if (use_s2n_bignum_alt()) { bignum_sqr_p521_alt(z, x); } - else { bignum_sqr_p521(z, x); } -} - -// Convert little-endian bytes to 9-digit 528-bit bignum -extern void bignum_fromlebytes_p521(uint64_t z[S2N_BIGNUM_STATIC 9], const uint8_t x[S2N_BIGNUM_STATIC 66]); - -// Convert 9-digit 528-bit bignum to little-endian bytes -extern void bignum_tolebytes_p521(uint8_t z[S2N_BIGNUM_STATIC 66], const uint64_t x[S2N_BIGNUM_STATIC 9]); - -extern void 
p521_jdouble(uint64_t p3[S2N_BIGNUM_STATIC 27],uint64_t p1[S2N_BIGNUM_STATIC 27]); -extern void p521_jdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 27],uint64_t p1[S2N_BIGNUM_STATIC 27]); -static inline void p521_jdouble_selector(uint64_t p3[S2N_BIGNUM_STATIC 27],uint64_t p1[S2N_BIGNUM_STATIC 27]) { - if (use_s2n_bignum_alt()) { p521_jdouble_alt(p3, p1); } - else { p521_jdouble(p3, p1); } -} -extern void p521_jscalarmul(uint64_t res[S2N_BIGNUM_STATIC 27], const uint64_t scalar[S2N_BIGNUM_STATIC 9], const uint64_t point[S2N_BIGNUM_STATIC 27]); -extern void p521_jscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 27], const uint64_t scalar[S2N_BIGNUM_STATIC 9], const uint64_t point[S2N_BIGNUM_STATIC 27]); -static inline void p521_jscalarmul_selector(uint64_t res[S2N_BIGNUM_STATIC 27], const uint64_t scalar[S2N_BIGNUM_STATIC 9], const uint64_t point[S2N_BIGNUM_STATIC 27]) { - if (use_s2n_bignum_alt()) { p521_jscalarmul_alt(res, scalar, point); } - else { p521_jscalarmul(res, scalar, point); } -} - -// Modular inverse modulo p_521 = 2^521 - 1 -// z = x^-1 mod p_521. -// The function is constant-time. -extern void bignum_inv_p521(uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); - -// curve25519_x25519_byte and curve25519_x25519_byte_alt computes the x25519 -// function specified in https://www.rfc-editor.org/rfc/rfc7748. |scalar| is the -// scalar, |point| is the u-coordinate of the elliptic curve -// point. The result, another u-coordinate, is saved in |res|. -extern void curve25519_x25519_byte(uint8_t res[S2N_BIGNUM_STATIC 32], const uint8_t scalar[S2N_BIGNUM_STATIC 32], const uint8_t point[S2N_BIGNUM_STATIC 32]); -extern void curve25519_x25519_byte_alt(uint8_t res[S2N_BIGNUM_STATIC 32], const uint8_t scalar[S2N_BIGNUM_STATIC 32], const uint8_t point[S2N_BIGNUM_STATIC 32]); -static inline void curve25519_x25519_byte_selector(uint8_t res[S2N_BIGNUM_STATIC 32], const uint8_t scalar[S2N_BIGNUM_STATIC 32], const uint8_t point[S2N_BIGNUM_STATIC 32]) { - if (use_s2n_bignum_alt()) { curve25519_x25519_byte_alt(res, scalar, point); } - else { curve25519_x25519_byte(res, scalar, point); } -} - -// curve25519_x25519base_byte and curve25519_x25519base_byte_alt computes the -// x25519 function specified in https://www.rfc-editor.org/rfc/rfc7748 using the -// basepoint specified in section 4.1. |scalar| is the scalar. The result, -// another u-coordinate, is saved in |res|. -extern void curve25519_x25519base_byte(uint8_t res[S2N_BIGNUM_STATIC 32], const uint8_t scalar[S2N_BIGNUM_STATIC 32]); -extern void curve25519_x25519base_byte_alt(uint8_t res[S2N_BIGNUM_STATIC 32], const uint8_t scalar[S2N_BIGNUM_STATIC 32]); -static inline void curve25519_x25519base_byte_selector(uint8_t res[S2N_BIGNUM_STATIC 32], const uint8_t scalar[S2N_BIGNUM_STATIC 32]) { - if (use_s2n_bignum_alt()) { curve25519_x25519base_byte_alt(res, scalar); } - else { curve25519_x25519base_byte(res, scalar); } -} - -// Evaluate z := x^2 where x is a 2048-bit integer. -// Input: x[32]; output: z[64]; temporary buffer: t[>=72] -#define S2NBIGNUM_KSQR_32_64_TEMP_NWORDS 72 -extern void -bignum_ksqr_32_64(uint64_t z[S2N_BIGNUM_STATIC 64], const uint64_t x[S2N_BIGNUM_STATIC 32], - uint64_t t[S2N_BIGNUM_STATIC S2NBIGNUM_KSQR_32_64_TEMP_NWORDS]); -extern void -bignum_ksqr_32_64_neon(uint64_t z[S2N_BIGNUM_STATIC 64], const uint64_t x[S2N_BIGNUM_STATIC 32], - uint64_t t[S2N_BIGNUM_STATIC S2NBIGNUM_KSQR_32_64_TEMP_NWORDS]); - -// Evaluate z := x^2 where x is a 1024-bit integer. 
-// Input: x[16]; output: z[32]; temporary buffer: t[>=24] -#define S2NBIGNUM_KSQR_16_32_TEMP_NWORDS 24 -extern void -bignum_ksqr_16_32(uint64_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 16], - uint64_t t[S2N_BIGNUM_STATIC S2NBIGNUM_KSQR_16_32_TEMP_NWORDS]); -extern void -bignum_ksqr_16_32_neon(uint64_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 16], - uint64_t t[S2N_BIGNUM_STATIC S2NBIGNUM_KSQR_16_32_TEMP_NWORDS]); - -// Evaluate z := x * y where x and y are 2048-bit integers. -// Inputs: x[32], y[32]; output: z[64]; temporary buffer t[>=96] -#define S2NBIGNUM_KMUL_32_64_TEMP_NWORDS 96 -extern void -bignum_kmul_32_64(uint64_t z[S2N_BIGNUM_STATIC 64], const uint64_t x[S2N_BIGNUM_STATIC 32], - const uint64_t y[S2N_BIGNUM_STATIC 32], - uint64_t t[S2N_BIGNUM_STATIC S2NBIGNUM_KMUL_32_64_TEMP_NWORDS]); -extern void -bignum_kmul_32_64_neon(uint64_t z[S2N_BIGNUM_STATIC 64], const uint64_t x[S2N_BIGNUM_STATIC 32], - const uint64_t y[S2N_BIGNUM_STATIC 32], - uint64_t t[S2N_BIGNUM_STATIC S2NBIGNUM_KMUL_32_64_TEMP_NWORDS]); - -// Evaluate z := x * y where x and y are 1024-bit integers. -// Inputs: x[16], y[16]; output: z[32]; temporary buffer t[>=32] -#define S2NBIGNUM_KMUL_16_32_TEMP_NWORDS 32 -extern void -bignum_kmul_16_32(uint64_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 16], - const uint64_t y[S2N_BIGNUM_STATIC 16], - uint64_t t[S2N_BIGNUM_STATIC S2NBIGNUM_KMUL_16_32_TEMP_NWORDS]); -extern void -bignum_kmul_16_32_neon(uint64_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 16], - const uint64_t y[S2N_BIGNUM_STATIC 16], - uint64_t t[S2N_BIGNUM_STATIC S2NBIGNUM_KMUL_16_32_TEMP_NWORDS]); - -// Extended Montgomery reduce in 8-digit blocks. -// Assumes that z initially holds a 2k-digit bignum z_0, m is a k-digit odd -// bignum and m * w == -1 (mod 2^64). This function also uses z for the output -// as well as returning a carry c of 0 or 1. This encodes two numbers: in the -// lower half of the z buffer we have q = z[0..k-1], while the upper half -// together with the carry gives r = 2^{64k}*c + z[k..2k-1]. These values -// satisfy z_0 + q * m = 2^{64k} * r, i.e. r gives a raw (unreduced) Montgomery -// reduction while q gives the multiplier that was used. -// Note that q = (z_0 mod 2^{64k}) * (-m^-1 mod 2^{64k}) mod 2^{64k}. -// z_0 + q * m = 0 mod 2^{64k} -// q * m = -z_0 mod 2^{64k} -// q = -z_0 * m^-1 mod 2^{64k} -// = (z_0 mod 2^{64k}) * (-m^-1 mod 2^{64k}) mod 2^{64k} -// q is uniquely determined because q must be in the range of [0, 2^{64k}-1]. -// Inputs: z[2*k], m[k], w; outputs: function return (extra result bit) and z[2*k] -extern uint64_t bignum_emontredc_8n(uint64_t k, uint64_t *z, const uint64_t *m, - uint64_t w); -extern uint64_t bignum_emontredc_8n_neon(uint64_t k, uint64_t *z, const uint64_t *m, - uint64_t w); - -// Optionally subtract, z := x - y (if p nonzero) or z := x (if p zero) -// Inputs: x[k], p, y[k]; outputs: function return (carry-out) and z[k] -extern uint64_t bignum_optsub(uint64_t k, uint64_t *z, const uint64_t *x, uint64_t p, - const uint64_t *y); - -// Compare bignums, x >= y. -// Inputs: x[m], y[n]; output: function return (1 if x >= y) -extern uint64_t bignum_ge(uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y); - -// General big-integer multiplication (z := x * y). -// Inputs: x[m], y[n]; output: z[k]. If k < m+n, the result is truncated. 
-extern void bignum_mul(uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x, - uint64_t n, const uint64_t *y); - -// General big-integer squaring (z := x^2). -// Inputs: x[m]; output: z[k]. If k < 2m, the result is truncated. -extern void bignum_sqr(uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x); - -// Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1] -// into z[0..row-1]. -// This function is constant-time with respect to the value of `idx`. This is -// achieved by reading the whole table and using the bit-masking to get the -// `idx`-th row. -// Input table[height*width]; output z[width] -extern void bignum_copy_row_from_table (uint64_t *z, const uint64_t *table, - uint64_t height, uint64_t width, uint64_t idx); - -// Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1] -// into z[0..row-1]. width must be a multiple of 8. -// This function is constant-time with respect to the value of `idx`. This is -// achieved by reading the whole table and using the bit-masking to get the -// `idx`-th row. -// Input table[height*width]; output z[width] -extern void bignum_copy_row_from_table_8n_neon (uint64_t *z, const uint64_t *table, - uint64_t height, uint64_t width, uint64_t idx); - -// Given table: uint64_t[height*16], copy table[idx*16...(idx+1)*16-1] into z[0..row-1]. -// This function is constant-time with respect to the value of `idx`. This is -// achieved by reading the whole table and using the bit-masking to get the -// `idx`-th row. -// Input table[height*16]; output z[16] -extern void bignum_copy_row_from_table_16_neon (uint64_t *z, const uint64_t *table, - uint64_t height, uint64_t idx); - -// Given table: uint64_t[height*32], copy table[idx*32...(idx+1)*32-1] into z[0..row-1]. -// This function is constant-time with respect to the value of `idx`. This is -// achieved by reading the whole table and using the bit-masking to get the -// `idx`-th row. -// Input table[height*32]; output z[32] -extern void bignum_copy_row_from_table_32_neon (uint64_t *z, const uint64_t *table, - uint64_t height, uint64_t idx); - -// Reduction is modulo the order of the curve25519/edwards25519 basepoint, -// which is n_25519 = 2^252 + 27742317777372353535851937790883648493. -// Reduce modulo basepoint order, z := x mod n_25519 -// Input x[k]; output z[4] -extern void bignum_mod_n25519(uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, uint64_t *x); - -// Negate modulo p_25519, z := (-x) mod p_25519, assuming x reduced -// Input x[4]; output z[4] -extern void bignum_neg_p25519(uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t x[S2N_BIGNUM_STATIC 4]); - -// Performs z := (x * y + c) mod n_25519, where the modulus is -// n_25519 = 2^252 + 27742317777372353535851937790883648493, the -// order of the curve25519/edwards25519 basepoint. The result z -// and the inputs x, y and c are all 4 digits (256 bits). 
-// Inputs x[4], y[4], c[4]; output z[4] -extern void bignum_madd_n25519(uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t x[S2N_BIGNUM_STATIC 4], - uint64_t y[S2N_BIGNUM_STATIC 4], uint64_t c[S2N_BIGNUM_STATIC 4]); -extern void bignum_madd_n25519_alt(uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t x[S2N_BIGNUM_STATIC 4], - uint64_t y[S2N_BIGNUM_STATIC 4], uint64_t c[S2N_BIGNUM_STATIC 4]); -static inline void bignum_madd_n25519_selector(uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t x[S2N_BIGNUM_STATIC 4], uint64_t y[S2N_BIGNUM_STATIC 4], uint64_t c[S2N_BIGNUM_STATIC 4]) { - if (use_s2n_bignum_alt()) { bignum_madd_n25519_alt(z, x, y, c); } - else { bignum_madd_n25519(z, x, y, c); } -} - -// This assumes that the input buffer p points to a pair of 256-bit -// numbers x (at p) and y (at p+4) representing a point (x,y) on the -// edwards25519 curve. It is assumed that both x and y are < p_25519 -// but there is no checking of this, nor of the fact that (x,y) is -// in fact on the curve. -// -// The output in z is a little-endian array of bytes corresponding to -// the standard compressed encoding of a point as 2^255 * x_0 + y -// where x_0 is the least significant bit of x. -// See "https://datatracker.ietf.org/doc/html/rfc8032#section-5.1.2" -// In this implementation, y is simply truncated to 255 bits, but if -// it is reduced mod p_25519 as expected this does not affect values. -extern void edwards25519_encode(uint8_t z[S2N_BIGNUM_STATIC 32], uint64_t p[S2N_BIGNUM_STATIC 8]); - -// This interprets the input byte string as a little-endian number -// representing a point (x,y) on the edwards25519 curve, encoded as -// 2^255 * x_0 + y where x_0 is the least significant bit of x. It -// returns the full pair of coordinates x (at z) and y (at z+4). The -// return code is 0 for success and 1 for failure, which means that -// the input does not correspond to the encoding of any edwards25519 -// point. This can happen for three reasons, where y = the lowest -// 255 bits of the input: -// -// * y >= p_25519 -// Input y coordinate is not reduced -// * (y^2 - 1) * (1 + d_25519 * y^2) has no modular square root -// There is no x such that (x,y) is on the curve -// * y^2 = 1 and top bit of input is set -// Cannot be the canonical encoding of (0,1) or (0,-1) -// -// Input c[32] (bytes); output function return and z[8] -extern uint64_t edwards25519_decode(uint64_t z[S2N_BIGNUM_STATIC 8], const uint8_t c[S2N_BIGNUM_STATIC 32]); -extern uint64_t edwards25519_decode_alt(uint64_t z[S2N_BIGNUM_STATIC 8], const uint8_t c[S2N_BIGNUM_STATIC 32]); -static inline uint64_t edwards25519_decode_selector(uint64_t z[S2N_BIGNUM_STATIC 8], const uint8_t c[S2N_BIGNUM_STATIC 32]) { - if (use_s2n_bignum_alt()) { return edwards25519_decode_alt(z, c); } - else { return edwards25519_decode(z, c); } -} - -// Given a scalar n, returns point (X,Y) = n * B where B = (...,4/5) is -// the standard basepoint for the edwards25519 (Ed25519) curve. 
-// Input scalar[4]; output res[8] -extern void edwards25519_scalarmulbase(uint64_t res[S2N_BIGNUM_STATIC 8], uint64_t scalar[S2N_BIGNUM_STATIC 4]); -extern void edwards25519_scalarmulbase_alt(uint64_t res[S2N_BIGNUM_STATIC 8], uint64_t scalar[S2N_BIGNUM_STATIC 4]); -static inline void edwards25519_scalarmulbase_selector(uint64_t res[S2N_BIGNUM_STATIC 8], uint64_t scalar[S2N_BIGNUM_STATIC 4]) { - if (use_s2n_bignum_alt()) { edwards25519_scalarmulbase_alt(res, scalar); } - else { edwards25519_scalarmulbase(res, scalar); } -} - -// Given scalar = n, point = P and bscalar = m, returns in res -// the point (X,Y) = n * P + m * B where B = (...,4/5) is -// the standard basepoint for the edwards25519 (Ed25519) curve. -// -// Both 256-bit coordinates of the input point P are implicitly -// reduced modulo 2^255-19 if they are not already in reduced form, -// but the conventional usage is that they *are* already reduced. -// The scalars can be arbitrary 256-bit numbers but may also be -// considered as implicitly reduced modulo the group order. -// -// Input scalar[4], point[8], bscalar[4]; output res[8] -extern void edwards25519_scalarmuldouble(uint64_t res[S2N_BIGNUM_STATIC 8], uint64_t scalar[S2N_BIGNUM_STATIC 4], - uint64_t point[S2N_BIGNUM_STATIC 8], uint64_t bscalar[S2N_BIGNUM_STATIC 4]); -extern void edwards25519_scalarmuldouble_alt(uint64_t res[S2N_BIGNUM_STATIC 8], uint64_t scalar[S2N_BIGNUM_STATIC 4], - uint64_t point[S2N_BIGNUM_STATIC 8], uint64_t bscalar[S2N_BIGNUM_STATIC 4]); -static inline void edwards25519_scalarmuldouble_selector(uint64_t res[S2N_BIGNUM_STATIC 8], uint64_t scalar[S2N_BIGNUM_STATIC 4], uint64_t point[S2N_BIGNUM_STATIC 8], uint64_t bscalar[S2N_BIGNUM_STATIC 4]) { - if (use_s2n_bignum_alt()) { edwards25519_scalarmuldouble_alt(res, scalar, point, bscalar); } - else { edwards25519_scalarmuldouble(res, scalar, point, bscalar); } -} - -#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/CODE_OF_CONDUCT.md b/third_party/s2n-bignum/s2n-bignum-imported/CODE_OF_CONDUCT.md new file mode 100644 index 00000000000..5b627cfa60b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/CODE_OF_CONDUCT.md @@ -0,0 +1,4 @@ +## Code of Conduct +This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). +For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact +opensource-codeofconduct@amazon.com with any additional questions or comments. diff --git a/third_party/s2n-bignum/s2n-bignum-imported/CONTRIBUTING.md b/third_party/s2n-bignum/s2n-bignum-imported/CONTRIBUTING.md new file mode 100644 index 00000000000..c4b6a1c5081 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/CONTRIBUTING.md @@ -0,0 +1,59 @@ +# Contributing Guidelines + +Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional +documentation, we greatly value feedback and contributions from our community. + +Please read through this document before submitting any issues or pull requests to ensure we have all the necessary +information to effectively respond to your bug report or contribution. + + +## Reporting Bugs/Feature Requests + +We welcome you to use the GitHub issue tracker to report bugs or suggest features. + +When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already +reported the issue. Please try to include as much information as you can. 
Details like these are incredibly useful: + +* A reproducible test case or series of steps +* The version of our code being used +* Any modifications you've made relevant to the bug +* Anything unusual about your environment or deployment + + +## Contributing via Pull Requests +Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: + +1. You are working against the latest source on the *main* branch. +2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. +3. You open an issue to discuss any significant work - we would hate for your time to be wasted. + +To send us a pull request, please: + +1. Fork the repository. +2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. +3. Ensure local tests pass. +4. Commit to your fork using clear commit messages. +5. Send us a pull request, answering any default questions in the pull request interface. +6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. + +GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and +[creating a pull request](https://help.github.com/articles/creating-a-pull-request/). + + +## Finding contributions to work on +Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. + + +## Code of Conduct +This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). +For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact +opensource-codeofconduct@amazon.com with any additional questions or comments. + + +## Security issue notifications +If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. + + +## Licensing + +See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. diff --git a/third_party/s2n-bignum/s2n-bignum-imported/LICENSE b/third_party/s2n-bignum/s2n-bignum-imported/LICENSE new file mode 100644 index 00000000000..7a5168f979d --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/LICENSE @@ -0,0 +1,222 @@ +SPDX-License-Identifier: Apache-2.0 OR ISC or MIT-0 + + +Apache 2.0 license +------------------------------------- + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + +ISC license +------------------------------------- + +Copyright Amazon.com, Inc. or its affiliates. + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + +MIT-0 license +------------------------------------- + +Copyright 2021-2024 Amazon.com, Inc. or its affiliates. 
+ +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/third_party/s2n-bignum/s2n-bignum-imported/NOTICE b/third_party/s2n-bignum/s2n-bignum-imported/NOTICE new file mode 100644 index 00000000000..616fc588945 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/NOTICE @@ -0,0 +1 @@ +Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. diff --git a/third_party/s2n-bignum/s2n-bignum-imported/README.md b/third_party/s2n-bignum/s2n-bignum-imported/README.md new file mode 100644 index 00000000000..769bfd8497e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/README.md @@ -0,0 +1,512 @@ +## s2n-bignum + +s2n-bignum is a collection of integer arithmetic routines designed for +cryptographic applications. All routines are written in pure machine code, +designed to be callable from C and other high-level languages, with separate but +API-compatible versions of each function for 64-bit x86 (x86_64) and ARM +(aarch64). + +s2n-bignum's primary goals are performance and assurance: Assembly routines are +tuned for highest performance both by hand and using automatic optimization +techniques such as the [SLOTHY](https://github.com/slothy-optimizer/slothy) +superoptimizer, and each function is accompanied by a machine-checked formal +proof in [HOL-Light](https://hol-light.github.io/) that its mathematical +result is correct, based on a formal model of the underlying machine. Each +function is moreover written in a constant-time style to avoid timing +side-channels. + +### Building + +Assuming a suitable operating system (e.g. Linux, Mac OS X, or Windows with +Cygwin) and a few basic build tools you should be able to download the repo and +build with just a few basic commands. On an x86 machine: + + git clone https://github.com/awslabs/s2n-bignum + cd ./s2n-bignum + (cd ./x86; make) + +while on an ARM machine (aarch64, arm64) just replace "x86" with "arm": + + git clone https://github.com/awslabs/s2n-bignum + cd ./s2n-bignum + (cd ./arm; make) + +This results in a library of bignum mathematical functions that can be +called from C or other languages. To run basic unit tests on the library +just built: + + (cd ./tests; make go) + +To run the benchmarking code to get performance numbers for your platform +(this usually takes several minutes): + + (cd ./benchmarks; make go) + +The code is all written in assembler, with each individual mathematical +function consisting of a `.S` file that can be assembled by directly +invoking the GNU C compiler `gcc` or by explicitly combining the C +preprocessor and an assembler or other C or C++ compiler. 
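As an illustrative sketch (not taken verbatim from the upstream documentation): on an aarch64 machine with the GNU toolchain, a single ARM source file such as `generic/bignum_add.S` can typically be assembled from inside the `arm` subdirectory in either of the two ways just described. The `-I../include` path and the explicit preprocess-then-assemble pipeline mirror what the bundled Makefiles do; the choice of file and the one-step `gcc -c` variant are assumptions made for the example.

    # One step: let gcc run the C preprocessor on the .S file and assemble it
    gcc -c -I../include generic/bignum_add.S -o generic/bignum_add.o

    # Two steps: preprocess explicitly, then pipe the result to the assembler
    gcc -E -I../include -xassembler-with-cpp generic/bignum_add.S | as -o generic/bignum_add.o -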
If using your own
+build command, consult the existing Makefiles for guidance since there are some
+subtle variations even among assemblers (e.g. some C compilers won't handle
+multiple instructions per line when taking in assembler files).
+
+### Using the library
+
+The build process above results in a library that can be used to provide all
+the functionality together (e.g. `x86/libs2nbignum.a` for an x86 machine),
+as well as individual object files, one per function, that can be used for more
+fine-grained linkage (e.g. `x86/generic/bignum_add.o` for the addition
+function on x86). The functions all use standard Application Binary Interfaces
+to connect to C and other high-level languages; the ABI determines, for
+example, which registers or stack frames hold the arguments to a function when
+called. The x86+Windows combination uses a non-standard ABI, which can
+explicitly be forced using the additional option `-DWINDOWS_ABI=1` when
+building. In either case the C-level prototypes for the functions are collected
+in a header file that can be included in C programs to specify the interfaces.
+A quick browse through this also gives an idea of what functions the
+library provides.
+
+[s2n-bignum/include/s2n-bignum.h](https://github.com/awslabs/s2n-bignum/blob/main/include/s2n-bignum.h)
+
+You can include this in a C program as usual, after first including
+the standard header defining the types `uint64_t` etc. that are
+basic for s2n-bignum:
+
+ #include <inttypes.h>
+ #include "s2n-bignum.h"
+
+Here is a small complete C program `myprogram.c` calling the
+library, computing the modular inverse of 12345 modulo the wordsize
+using the `word_negmodinv` function provided by the library,
+then printing out confirmation that it works:
+
+```
+#include <inttypes.h>
+#include <stdio.h>
+#include "s2n-bignum.h"
+
+int main(void)
+{
+ uint64_t x = 12345;
+ uint64_t y = -word_negmodinv(x);
+ printf("%ld * %ld = %ld (mod 2^64)\n",x,y,x*y);
+}
+```
+
+Assuming you are on an x86 machine in a directory above the
+`s2n-bignum` subdirectory (otherwise change the `.`
+below into an appropriate path and/or change `x86` to `arm`),
+you can compile this as follows, specifying the paths to the
+library itself and the headers:
+
+ gcc -o myprogram myprogram.c -I./s2n-bignum/include/ -L./s2n-bignum/x86/ -ls2nbignum
+
+and then run it as usual to see the output:
+
+ $ ./myprogram
+ 12345 * 5288216061308878345 = 1 (mod 2^64)
+
+### Architectural and microarchitectural considerations
+
+The overall C-level interface supported by the library is the same
+regardless of architecture, ARM or x86. In each case, however, there
+are some architectural and microarchitectural considerations to be
+aware of:
+
+ * On ARM, each function will work correctly on any existing
+ microarchitecture. However, some functions have two variants
+ with significant performance differences according to platform.
+ The versions with `_alt` suffixes are designed to maximize
+ performance on microarchitectures with higher multiplier
+ throughput (typically more recent ones, like the Apple M1), while
+ the non-alt variants are better suited to 'traditional' ARM
+ microarchitectures with lower multiplier throughput (specifically,
+ limited pipelining of the `UMULH` instruction to get the
+ high part of a 64x64-bit product).
+
+ * On x86, all generic bignum functions (in the `x86/generic`
+ subdirectory) will work correctly on any existing microarchitecture.
+ Some of the more highly optimized functions for specific elliptic
+ curves etc.
require the BMI and ADX instruction set extensions
+ (specifically the `MULX`, `ADCX` and `ADOX` instructions).
+ In such cases, the `_alt` suffix forms are provided
+ as a backup that will work for older platforms. In all cases where
+ there is such an alt form provided, the non-alt form is likely to be
+ faster where those instructions are supported, as on most recent
+ x86-64 chips.
+
+If you are unsure which version of a function to use on your platform, a simple
+test is to run the benchmarking code (see above) and examine the results. For
+example, this is a contemporary ARM platform where the alt form performs
+better:
+
+```
+...
+curve25519_x25519 : 26661.8 ns each (var 0.8%, corr 0.03) = 37507 ops/sec
+curve25519_x25519_alt : 19297.7 ns each (var 0.4%, corr -0.03) = 51820 ops/sec
+...
+```
+
+and this is a typical x86 chip where the non-alt form is faster:
+
+```
+...
+curve25519_x25519 : 30103.0 ns each (var 0.0%, corr -0.14) = 33219 ops/sec
+curve25519_x25519_alt : 38097.0 ns each (var 0.0%, corr -0.11) = 26249 ops/sec
+...
+```
+
+while this is a very old x86 machine where the required instructions for
+the non-alt form are not supported:
+
+```
+...
+curve25519_x25519 : *** NOT APPLICABLE ***
+curve25519_x25519_alt : 51977.2 ns each (var 1.4%, corr 0.01) = 19239 ops/sec
+...
+```
+
+### Constant-time bignums
+
+The s2n-bignum library provides a simple and flexible API for manipulating
+bignums, which are integers of arbitrary size (operations focus on nonnegative
+integers, but use 2s complement where appropriate for negation). The integers
+are represented as little-endian arrays of unsigned 64-bit "digits", where the
+digits can be accessed via the standard `uint64_t` type in C. They can be
+explicitly read and written as normal C arrays as well as via the s2n-bignum
+API. For example, here is how one might set up the constant 2^255 - 19
+as a 4-digit bignum (note the little-endian digit representation, independent
+of the byte order of the underlying machine):
+
+```
+uint64_t p_25519[4] =
+{
+ UINT64_C(0xffffffffffffffed),
+ UINT64_C(0xffffffffffffffff),
+ UINT64_C(0xffffffffffffffff),
+ UINT64_C(0x7fffffffffffffff)
+};
+```
+
+The arrays can be arbitrarily large or small and the sizes can be runtime
+parameters, with no overall restriction to specific sizes like 4 in the example
+above. However, in contrast to many standard bignum interfaces like that
+supported by [GMP](https://gmplib.org/), the operations do not dynamically
+adjust the sizes, but require them to be explicitly specified by the user when
+calling each function. The reason for this is to allow flexibility and
+genericity while also enforcing "constant-time" behavior for security from
+timing side-channels in cryptographic applications.
+
+By "constant-time" we mean roughly that a given bignum operation takes a time
+that is independent of the actual numbers involved, depending only on their
+*nominal* sizes. Each s2n-bignum operation takes and returns bignums
+of specified nominal sizes, and manipulates them on the basis of the nominal
+sizes only, independent of their actual numeric values (even if those are
+zero). If a result does not fit in the size provided, it is systematically
+truncated modulo that size. s2n-bignum functions never strip away leading
+zeros to make numbers shorter, nor do they allocate extra space to make them
+longer; indeed, they perform no memory allocation or other OS calls at all.
+
+For instance, the basic multiplication function has the following C prototype:
+
+```
+void bignum_mul(uint64_t p,uint64_t *z, uint64_t m,uint64_t *x, uint64_t n,uint64_t *y);
+```
+
+This means that `x` points to an `m`-digit bignum (little-endian, with
+64-bit words as the digits), `y` points to an `n`-digit bignum, and the
+function writes their product to the `p`-word buffer pointed to by `z`,
+truncating it modulo 2^(64p) if it doesn't fit. In this setting
+with nominal sizes for all numbers, the "constant-time" characteristic means
+that the actual sequence of machine instructions executed, including the
+specific addresses and sequencing of memory loads and stores, is
+*independent of the numbers themselves*, depending only on their nominal sizes
+(`m`, `n` and `p` for the above example).
+
+Since the s2n-bignum interface is just using pointers to pre-existing arrays,
+any allocation of memory is the caller's responsibility. Some s2n-bignum
+functions use space on the stack for intermediate computations (or just to save
+and restore registers), but only in cases where that size is bounded and
+moderate. For the few generic-size functions that need similarly generic (and
+hence unbounded a priori) space for intermediate storage, it needs to be
+provided by the caller via an additional argument. For example, the final
+argument to the `bignum_modinv` (modular inverse) function is to a temporary
+buffer of a size depending on the generic size parameter `k` (specifically,
+according to the API it should be `>= 3 * k`):
+
+```
+void bignum_modinv (uint64_t k, uint64_t *z, uint64_t *a, uint64_t *b, uint64_t *t);
+```
+
+In order to keep the generic API more convenient, minimizing the need for such
+additional parameters, functions sometimes read from and write to the provided
+buffers in interleaved fashion in a way that assumes inputs and outputs do not
+overlap. Aliasing of input and output buffers is however usually allowed in
+fixed-size functions and (provided they are exactly the same, not overlapped in
+more intricate fashion) "linear" generic-sized functions; consult the detailed
+API reference for more details.
+
+### What's in the library?
+
+The s2n-bignum library supports basic bignum arithmetic using the API specified
+above, as well as a host of related operations, the aim being to provide
+convenient and reliable building-blocks for higher-level cryptographic
+functionality. The range of operations provided covers:
+
+- Elementary operations on 64-bit words, mainly to provide reference
+ implementations that are constant-time, e.g. `word_max` (maximum),
+ `word_clz` (counting leading zeros)
+
+- Basic generic-size bignum arithmetic functionality like `bignum_add`
+ (addition), `bignum_sub` (subtraction), `bignum_mul` (multiplication),
+ `bignum_eq` (equality comparison).
+
+- Generic-size constant-time data manipulation like `bignum_digit` (selecting
+ a digit, like array indexing but without any difference in memory access
+ pattern from element number) and `bignum_mux` (multiplexing or if-then-else,
+ analogous to C `b ? x : y`).
+
+- Generic-size Montgomery operations like `bignum_montmul` (Montgomery
+ multiplication), `bignum_montredc` (Montgomery reduction) and
+ `bignum_montifier` (computes constant for mapping into Montgomery form)
+ for performing modular arithmetic in Montgomery form for any odd modulus.
+
+- Optimized multiplication and squaring operations for specific sizes, e.g.
+ `bignum_mul_4_8` (multiply two 4-digit numbers with 8-digit result) and
+ `bignum_sqr_16_32` (square a 16-digit number with 32-digit result).
+
+- Optimized modular and/or Montgomery arithmetic operations for common
+ primes that are field characteristics for specific elliptic curves,
+ e.g. `bignum_montmul_p521` (Montgomery multiplication modulo
+ 2^521 - 1) for NIST P-521, `bignum_sqr_p25519` (modular
+ squaring modulo 2^255 - 19 for curve25519).
+
+- Full top-level point operations for supported elliptic curves, e.g.
+ `p256_jadd` (point addition on NIST P-256 curve), `secp256k1_jdouble`
+ (point doubling for secp256k1). These usually assume a particular
+ coordinate representation, Jacobian in these cases (hence the "j").
+
+The elliptic curves with some special support are the following; the degree of
+support varies from just modular and/or Montgomery arithmetic operations for
+the field characteristic modulus, up to basic point operations, and even in
+some cases full scalar multiplication (e.g. `curve25519_x25519`).
+
+- curve25519/edwards25519
+- NIST P-256
+- NIST P-384
+- NIST P-521
+- secp256k1
+- SM2
+
+### Testing and formal verification
+
+The basic testing setup as mentioned above subjects each function to a number
+of unit tests, mainly using pseudo-random inputs and comparing against
+conceptually simpler (but neither efficient nor constant-time) C references,
+also doing some checking of pre-tabulated "known correct" results. This
+process
+
+ (cd ./tests; make go)
+
+should be enough to expose any basic problems, typically failure to assemble
+and link the code correctly. However, in pursuit of the highest standards of
+correctness, that basic testing is complemented by the far more rigorous and
+sophisticated process of *formal verification*.
+
+The formal verification process performs a machine-checked proof that the
+actual object file generated by the build process satisfies a high-level
+mathematical specification for *all* inputs (not just for specific test cases),
+assuming a formal model of how each processor (ARM or x86) executes code. These
+models make some simplifications and idealizations but model pretty faithfully
+the way in which specific machine instructions modify registers, flags and
+memory.
+
+To perform the formal proof for a particular function, you will need to install
+the latest version of [HOL Light](https://github.com/jrh13/hol-light/).
+The OPAM version might not work because it does not contain sufficiently recent
+libraries.
+To install HOL Light, please follow its
+[README](https://github.com/jrh13/hol-light/blob/master/README) instruction.
+After installation, set the `HOLDIR` environment variable to the path of
+the `hol-light` directory and use the Makefile within either the `arm` or
+`x86` directories to generate a target of the form
+`function_name.correct` for a corresponding object file `function_name.o`.
+Alternatively, the entire collection of functions can all be formally proved
+via the `proofs` pseudo-target. This is likely to be very time-consuming and
+hence better executed with some parallelism, e.g.
+
+ nohup make -j 16 proofs &
+
+The proof process is controlled by a corresponding "proof script" in the
+`proofs` subdirectory with corresponding name `proofs/function_name.ml`.
+The technical details of how the machine is modeled and how the proof is
+performed are too involved to enter into in detail in this brief summary,
+but by examining the proof script file you can find detailed specifications
+for each function, which might be considered the most rigorous possible
+form of API documentation.
+
+For example the file `arm/proofs/bignum_mul_p25519.ml` starts with a lengthy
+sequence of 32-bit words that specify the machine code being verified. This is
+not just accepted a priori as the canonical machine code, but actually checked
+against the object file to make sure it is indeed what is generated by the
+build process. The later proof then shows that executing this on the idealized
+machine model guarantees some toplevel mathematical properties. In this case,
+the specification that is proved looks like this:
+
+```
+nonoverlapping (word pc,0x288) (z,8 * 4)
+ ==> ensures arm
+ (\s. aligned_bytes_loaded s (word pc) bignum_mul_p25519_mc /\
+ read PC s = word pc /\
+ read X30 s = returnaddress /\
+ C_ARGUMENTS [z; x; y] s /\
+ bignum_from_memory(x,4) s = m /\
+ bignum_from_memory(y,4) s = n)
+ (\s. read PC s = returnaddress /\
+ bignum_from_memory(z,4) s = (m * n) MOD p_25519)
+ (MAYCHANGE [PC; X1; X2; X3; X4; X5; X6; X7; X8; X9;
+ X10; X11; X12; X13; X14; X15; X16; X17] ,,
+ MAYCHANGE [memory :> bytes(z,8 * 4)] ,,
+ MAYCHANGE SOME_FLAGS)
+```
+
+A detailed understanding of these formal specifications would take careful
+study of the underlying logical definitions, but in somewhat general
+impressionistic terms we can turn it into English as follows:
+
+ - We assume the output buffer `z` doesn't overlap the code being executed
+ but otherwise make no aliasing assumptions for inputs versus outputs.
+
+ - ASSUMING that we start in a state where
+
+ - the machine code specified at the start is loaded (4-byte aligned
+ as per ARM restrictions) and the program counter register `PC`
+ points to the start of it
+
+ - the return address to the caller is in register `X30` as per ABI
+
+ - the pointers `z`, `x` and `y` are set up in registers according
+ to the standard ABI rules
+
+ - the pointers `x` and `y` point at 4-digit bignums with respective
+ values `m` and `n`
+
+ - THEN we will reach another state where
+
+ - The program counter `PC` has jumped to the return address
+
+ - The buffer pointed to by `z` contains the mathematical answer
+ (x * y) mod p_25519, where p_25519 is an abbreviation for
+ 2^255 - 19.
+
+ - BETWEEN initial and final states, the only components of the
+ machine that could have been modified are:
+
+ - Registers including the program counter (of course) and general
+ purpose registers `X1`, ..., `X17` (freely modifiable by a subroutine
+ according to the ABI)
+
+ - The specified output buffer at address `z` of size 4 x 8-byte words.
+
+ - The machine flags (also freely modifiable according to the ABI)
+
+**Global Assumptions.**
+In addition to the assumptions described in the formal specifications,
+s2n-bignum implementations globally assume that the execution environment is
+configured as follows:
+
+- Alignment checking is disabled (`AC` flag in x86, `SCTLR_ELx.A` in ARM).
+ If these control bits are set, passing unaligned pointers as input/output
+ buffers of an s2n-bignum function may cause a crash.
If you are invoking the
+ functions from C/C++ via the C header file (`s2n-bignum.h`) however, the
+ alignment restriction on int-typed pointers in the C standard such as `uint64_t*`
+ will guarantee that the pointers are aligned regardless of the control bit.
+ The alignment conditions for code and stack pointers in ARM will be
+ explicitly described in the formal specifications.
+
+
+- Little-endian is set in ARM (`E` mask of `CPSR` in ARM). We believe all code
+ works equally well on a big-endian machine, but we do not validate that fact
+ ourselves, and the instruction model underlying the formal proof does not
+ directly address this question since it is assuming little-endian.
+
+- It is assumed that s2n-bignum is run in 64-bit mode.
+
+### Benchmarking and "constant time"
+
+The benchmarking setup included in the repository can be invoked, as mentioned
+above, by the following, starting in the s2n-bignum root directory and after
+building the library:
+
+ (cd ./benchmarks; make go)
+
+After some explanatory information which summarizes the explanations below,
+this shows a list of the execution time behavior of each function on the
+current platform, one per line in alphabetical order; generic-size functions
+like `bignum_add` are exercised on one or more specific sizes as shown in
+parentheses after the function name.
+
+```
+bignum_add (4x4->4) : 3.4 ns each (var 1.6%, corr 0.07) = 296073608 ops/sec
+bignum_add (6x6->6) : 4.3 ns each (var 1.3%, corr 0.02) = 233426704 ops/sec
+bignum_add (32x32->32) : 18.4 ns each (var 0.8%, corr -0.01) = 54430655 ops/sec
+bignum_add_p25519 : 2.2 ns each (var 2.9%, corr -0.01) = 462501779 ops/sec
+bignum_add_p256 : 2.9 ns each (var 1.6%, corr -0.01) = 342429670 ops/sec
+bignum_add_p256k1 : 2.6 ns each (var 1.9%, corr -0.04) = 387458274 ops/sec
+bignum_add_p384 : 4.4 ns each (var 1.1%, corr -0.03) = 226923614 ops/sec
+bignum_add_p521 : 4.3 ns each (var 1.4%, corr 0.02) = 232991612 ops/sec
+bignum_amontifier (32) : 2993.4 ns each (var 0.1%, corr -0.08) = 334073 ops/sec
+bignum_amontmul (32) : 2410.8 ns each (var 0.0%, corr -0.04) = 414797 ops/sec
+bignum_amontredc (32/16 -> 16) : 317.1 ns each (var 0.1%, corr -0.01) = 3153693 ops/sec
+bignum_amontsqr (32 -> 32) : 2410.2 ns each (var 0.0%, corr 0.05) = 414901 ops/sec
+...
+word_max : 0.8 ns each (var 4.2%, corr -0.03) = 1234333460 ops/sec
+word_min : 0.8 ns each (var 3.4%, corr 0.05) = 1237623762 ops/sec
+word_negmodinv : 2.7 ns each (var 2.0%, corr -0.11) = 366568915 ops/sec
+word_recip : 7.4 ns each (var 0.9%, corr -0.06) = 134380815 ops/sec
+```
+
+The first number reported is the average runtime, in nanoseconds (1 ns =
+10^-9 seconds, or one billionth of a second), over a large number of
+calls, and the last one is the reciprocal of this to give the average number of
+operations per second. Hence "smaller is better" for the first number while
+"bigger is better" for the final one.
+
+The "var" and "corr" numbers in parentheses attempt to give some empirical
+results on the variation in runtime with respect to the data being manipulated.
+Since this is intended to be invariant, one wishes these numbers to be small,
+though there is inevitably some variation because of miscellaneous platform
+factors. For each "bit density" between 0 and 64, pseudo-random inputs are
+generated with that bit density; the bit density is essentially the average
+number of 1 bits in each 64-bit word of these pseudo-random numbers (so bit
+density 0 means all zeros, bit density 64 means all 1s).
The function is +separately timed over each of these. The end results give the coefficient of +variation "var" (standard deviation divided by mean) and correlation +coefficient "corr" of runtime with bit density. + +As explained above, the "constant time" design principle is that the sequence +of machine instructions executed, including the access pattern of memory reads +and writes, is independent of the actual numeric data being manipulated, once +any parametric sizes are fixed. Any failures in practice to actually take +exactly the same time on all data (beyond some expected experimental errors +and flaws in the timing framework) could only arise if either: + + - The above "constant time" design discipline is not followed at all points + as intended. We consider this very unlikely, but in contrast to functional + correctness it is not actually rigorously machine-checked at present. We + anticipate in the future subjecting the code to automated dataflow analysis + as an additional validation test. + + - Some individual machine instructions that are used take a time that depends + on their data. We have specifically avoided certain machine instructions + known to be problematic in this respect (e.g. division instructions), but + we have no absolute guarantees from the hardware makers that there are no + such variations in the instructions we use, except on ARM platforms where + the "DIT" = "data-independent timing" bit is set. + +## Security + +See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. + +## License + +This project is licensed under the Apache-2.0 License or the ISC License or the MIT-0 License. diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/Makefile b/third_party/s2n-bignum/s2n-bignum-imported/arm/Makefile new file mode 100644 index 00000000000..e1d37985dd3 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/Makefile @@ -0,0 +1,518 @@ +############################################################################# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 +############################################################################# + +OSTYPE_RESULT=$(shell uname -s) +ARCHTYPE_RESULT=$(shell uname -m) + +# Assembler directives that mark symbols as .hidden +# or .private_extern can be enabled by passing +# in the S2N_BN_HIDE_SYMBOLS parameter as: +# +# make S2N_BN_HIDE_SYMBOLS=1 +# + +ifeq ($(S2N_BN_HIDE_SYMBOLS),1) +SYMBOL_HIDING=-DS2N_BN_HIDE_SYMBOLS=1 +else +SYMBOL_HIDING= +endif + + +# Add explicit language input parameter to cpp, otherwise the use of #n for +# numeric literals in ARM code is a problem when used inside #define macros +# since normally that means stringization. +# +# Some clang-based preprocessors seem to behave differently, and get confused +# by single-quote characters in comments, so we eliminate // comments first. + +ifeq ($(OSTYPE_RESULT),Darwin) +PREPROCESS=sed -e 's/\/\/.*//' | $(CC) -E -I../include $(SYMBOL_HIDING) -xassembler-with-cpp - +else +PREPROCESS=$(CC) -E -I../include $(SYMBOL_HIDING) -xassembler-with-cpp - +endif + +# Generally GNU-type assemblers are happy with multiple instructions on +# a line, but we split them up anyway just in case. + +SPLIT=tr ';' '\n' + +# If actually on an ARM8 machine, just use the assembler (as). 
Otherwise +# use a cross-assembling version so that the code can still be assembled +# and the proofs checked against the object files (though you won't be able +# to run code without additional emulation infrastructure). For the clang +# version on OS X we just add the "-arch arm64" option. For the Linux/gcc +# toolchain we assume the presence of the special cross-assembler. This +# can be installed via something like: +# +# sudo apt-get install binutils-aarch64-linux-gnu + +ifeq ($(ARCHTYPE_RESULT),aarch64) +ASSEMBLE=as +OBJDUMP=objdump -d +else +ifeq ($(ARCHTYPE_RESULT),arm64) +ASSEMBLE=as +OBJDUMP=objdump -d +else +ifeq ($(OSTYPE_RESULT),Darwin) +ASSEMBLE=as -arch arm64 +OBJDUMP=otool -tvV +else +ASSEMBLE=aarch64-linux-gnu-as +OBJDUMP=aarch64-linux-gnu-objdump -d +endif +endif +endif + +# List of object files for point operations and bignum operations + +POINT_OBJ = curve25519/curve25519_ladderstep.o \ + curve25519/curve25519_ladderstep_alt.o \ + curve25519/curve25519_pxscalarmul.o \ + curve25519/curve25519_pxscalarmul_alt.o \ + curve25519/curve25519_x25519.o \ + curve25519/curve25519_x25519_alt.o \ + curve25519/curve25519_x25519_byte.o \ + curve25519/curve25519_x25519_byte_alt.o \ + curve25519/curve25519_x25519base.o \ + curve25519/curve25519_x25519base_alt.o \ + curve25519/curve25519_x25519base_byte.o \ + curve25519/curve25519_x25519base_byte_alt.o \ + curve25519/edwards25519_decode.o \ + curve25519/edwards25519_decode_alt.o \ + curve25519/edwards25519_encode.o \ + curve25519/edwards25519_epadd.o \ + curve25519/edwards25519_epadd_alt.o \ + curve25519/edwards25519_epdouble.o \ + curve25519/edwards25519_epdouble_alt.o \ + curve25519/edwards25519_pdouble.o \ + curve25519/edwards25519_pdouble_alt.o \ + curve25519/edwards25519_pepadd.o \ + curve25519/edwards25519_pepadd_alt.o \ + curve25519/edwards25519_scalarmulbase.o \ + curve25519/edwards25519_scalarmulbase_alt.o \ + curve25519/edwards25519_scalarmuldouble.o \ + curve25519/edwards25519_scalarmuldouble_alt.o \ + p256/p256_montjadd.o \ + p256/p256_montjadd_alt.o \ + p256/p256_montjdouble.o \ + p256/p256_montjdouble_alt.o \ + p256/p256_montjmixadd.o \ + p256/p256_montjmixadd_alt.o \ + p256/p256_montjscalarmul.o \ + p256/p256_montjscalarmul_alt.o \ + p256/p256_scalarmul.o \ + p256/p256_scalarmul_alt.o \ + p256/p256_scalarmulbase.o \ + p256/p256_scalarmulbase_alt.o \ + p384/p384_montjadd.o \ + p384/p384_montjadd_alt.o \ + p384/p384_montjdouble.o \ + p384/p384_montjdouble_alt.o \ + p384/p384_montjmixadd.o \ + p384/p384_montjmixadd_alt.o \ + p384/p384_montjscalarmul.o \ + p384/p384_montjscalarmul_alt.o \ + p521/p521_jadd.o \ + p521/p521_jadd_alt.o \ + p521/p521_jdouble.o \ + p521/p521_jdouble_alt.o \ + p521/p521_jmixadd.o \ + p521/p521_jmixadd_alt.o \ + p521/p521_jscalarmul.o \ + p521/p521_jscalarmul_alt.o \ + secp256k1/secp256k1_jadd.o \ + secp256k1/secp256k1_jadd_alt.o \ + secp256k1/secp256k1_jdouble.o \ + secp256k1/secp256k1_jdouble_alt.o \ + secp256k1/secp256k1_jmixadd.o \ + secp256k1/secp256k1_jmixadd_alt.o \ + sm2/sm2_montjadd.o \ + sm2/sm2_montjadd_alt.o \ + sm2/sm2_montjdouble.o \ + sm2/sm2_montjdouble_alt.o \ + sm2/sm2_montjmixadd.o \ + sm2/sm2_montjmixadd_alt.o \ + sm2/sm2_montjscalarmul.o \ + sm2/sm2_montjscalarmul_alt.o + +BIGNUM_OBJ = curve25519/bignum_add_p25519.o \ + curve25519/bignum_cmul_p25519.o \ + curve25519/bignum_double_p25519.o \ + curve25519/bignum_inv_p25519.o \ + curve25519/bignum_invsqrt_p25519.o \ + curve25519/bignum_invsqrt_p25519_alt.o \ + curve25519/bignum_madd_n25519.o \ + curve25519/bignum_madd_n25519_alt.o \ 
+ curve25519/bignum_mod_m25519_4.o \ + curve25519/bignum_mod_n25519.o \ + curve25519/bignum_mod_n25519_4.o \ + curve25519/bignum_mod_p25519_4.o \ + curve25519/bignum_mul_p25519.o \ + curve25519/bignum_mul_p25519_alt.o \ + curve25519/bignum_neg_p25519.o \ + curve25519/bignum_optneg_p25519.o \ + curve25519/bignum_sqr_p25519.o \ + curve25519/bignum_sqr_p25519_alt.o \ + curve25519/bignum_sqrt_p25519.o \ + curve25519/bignum_sqrt_p25519_alt.o \ + curve25519/bignum_sub_p25519.o \ + fastmul/bignum_emontredc_8n.o \ + fastmul/bignum_emontredc_8n_cdiff.o \ + fastmul/bignum_kmul_16_32.o \ + fastmul/bignum_kmul_32_64.o \ + fastmul/bignum_ksqr_16_32.o \ + fastmul/bignum_ksqr_32_64.o \ + fastmul/bignum_mul_4_8.o \ + fastmul/bignum_mul_4_8_alt.o \ + fastmul/bignum_mul_6_12.o \ + fastmul/bignum_mul_6_12_alt.o \ + fastmul/bignum_mul_8_16.o \ + fastmul/bignum_mul_8_16_alt.o \ + fastmul/bignum_sqr_4_8.o \ + fastmul/bignum_sqr_4_8_alt.o \ + fastmul/bignum_sqr_6_12.o \ + fastmul/bignum_sqr_6_12_alt.o \ + fastmul/bignum_sqr_8_16.o \ + fastmul/bignum_sqr_8_16_alt.o \ + generic/bignum_add.o \ + generic/bignum_amontifier.o \ + generic/bignum_amontmul.o \ + generic/bignum_amontredc.o \ + generic/bignum_amontsqr.o \ + generic/bignum_bitfield.o \ + generic/bignum_bitsize.o \ + generic/bignum_cdiv.o \ + generic/bignum_cdiv_exact.o \ + generic/bignum_cld.o \ + generic/bignum_clz.o \ + generic/bignum_cmadd.o \ + generic/bignum_cmnegadd.o \ + generic/bignum_cmod.o \ + generic/bignum_cmul.o \ + generic/bignum_coprime.o \ + generic/bignum_copy.o \ + generic/bignum_copy_row_from_table.o \ + generic/bignum_copy_row_from_table_8n.o \ + generic/bignum_copy_row_from_table_16.o \ + generic/bignum_copy_row_from_table_32.o \ + generic/bignum_ctd.o \ + generic/bignum_ctz.o \ + generic/bignum_demont.o \ + generic/bignum_digit.o \ + generic/bignum_digitsize.o \ + generic/bignum_divmod10.o \ + generic/bignum_emontredc.o \ + generic/bignum_eq.o \ + generic/bignum_even.o \ + generic/bignum_ge.o \ + generic/bignum_gt.o \ + generic/bignum_iszero.o \ + generic/bignum_le.o \ + generic/bignum_lt.o \ + generic/bignum_madd.o \ + generic/bignum_modadd.o \ + generic/bignum_moddouble.o \ + generic/bignum_modexp.o \ + generic/bignum_modifier.o \ + generic/bignum_modinv.o \ + generic/bignum_modoptneg.o \ + generic/bignum_modsub.o \ + generic/bignum_montifier.o \ + generic/bignum_montmul.o \ + generic/bignum_montredc.o \ + generic/bignum_montsqr.o \ + generic/bignum_mul.o \ + generic/bignum_muladd10.o \ + generic/bignum_mux.o \ + generic/bignum_mux16.o \ + generic/bignum_negmodinv.o \ + generic/bignum_nonzero.o \ + generic/bignum_normalize.o \ + generic/bignum_odd.o \ + generic/bignum_of_word.o \ + generic/bignum_optadd.o \ + generic/bignum_optneg.o \ + generic/bignum_optsub.o \ + generic/bignum_optsubadd.o \ + generic/bignum_pow2.o \ + generic/bignum_shl_small.o \ + generic/bignum_shr_small.o \ + generic/bignum_sqr.o \ + generic/bignum_sub.o \ + generic/word_bytereverse.o \ + generic/word_clz.o \ + generic/word_ctz.o \ + generic/word_divstep59.o \ + generic/word_max.o \ + generic/word_min.o \ + generic/word_negmodinv.o \ + generic/word_popcount.o \ + generic/word_recip.o \ + p256/bignum_add_p256.o \ + p256/bignum_bigendian_4.o \ + p256/bignum_cmul_p256.o \ + p256/bignum_deamont_p256.o \ + p256/bignum_demont_p256.o \ + p256/bignum_double_p256.o \ + p256/bignum_half_p256.o \ + p256/bignum_inv_p256.o \ + p256/bignum_littleendian_4.o \ + p256/bignum_mod_n256.o \ + p256/bignum_mod_n256_4.o \ + p256/bignum_mod_p256.o \ + p256/bignum_mod_p256_4.o \ + 
p256/bignum_montinv_p256.o \ + p256/bignum_montmul_p256.o \ + p256/bignum_montmul_p256_alt.o \ + p256/bignum_montsqr_p256.o \ + p256/bignum_montsqr_p256_alt.o \ + p256/bignum_mux_4.o \ + p256/bignum_neg_p256.o \ + p256/bignum_nonzero_4.o \ + p256/bignum_optneg_p256.o \ + p256/bignum_sub_p256.o \ + p256/bignum_tomont_p256.o \ + p256/bignum_triple_p256.o \ + p384/bignum_add_p384.o \ + p384/bignum_bigendian_6.o \ + p384/bignum_cmul_p384.o \ + p384/bignum_deamont_p384.o \ + p384/bignum_demont_p384.o \ + p384/bignum_double_p384.o \ + p384/bignum_half_p384.o \ + p384/bignum_inv_p384.o \ + p384/bignum_littleendian_6.o \ + p384/bignum_mod_n384.o \ + p384/bignum_mod_n384_6.o \ + p384/bignum_mod_p384.o \ + p384/bignum_mod_p384_6.o \ + p384/bignum_montinv_p384.o \ + p384/bignum_montmul_p384.o \ + p384/bignum_montmul_p384_alt.o \ + p384/bignum_montsqr_p384.o \ + p384/bignum_montsqr_p384_alt.o \ + p384/bignum_mux_6.o \ + p384/bignum_neg_p384.o \ + p384/bignum_nonzero_6.o \ + p384/bignum_optneg_p384.o \ + p384/bignum_sub_p384.o \ + p384/bignum_tomont_p384.o \ + p384/bignum_triple_p384.o \ + p521/bignum_add_p521.o \ + p521/bignum_cmul_p521.o \ + p521/bignum_deamont_p521.o \ + p521/bignum_demont_p521.o \ + p521/bignum_double_p521.o \ + p521/bignum_fromlebytes_p521.o \ + p521/bignum_half_p521.o \ + p521/bignum_inv_p521.o \ + p521/bignum_mod_n521_9.o \ + p521/bignum_mod_p521_9.o \ + p521/bignum_montmul_p521.o \ + p521/bignum_montmul_p521_alt.o \ + p521/bignum_montsqr_p521.o \ + p521/bignum_montsqr_p521_alt.o \ + p521/bignum_mul_p521.o \ + p521/bignum_mul_p521_alt.o \ + p521/bignum_neg_p521.o \ + p521/bignum_optneg_p521.o \ + p521/bignum_sqr_p521.o \ + p521/bignum_sqr_p521_alt.o \ + p521/bignum_sub_p521.o \ + p521/bignum_tolebytes_p521.o \ + p521/bignum_tomont_p521.o \ + p521/bignum_triple_p521.o \ + secp256k1/bignum_add_p256k1.o \ + secp256k1/bignum_cmul_p256k1.o \ + secp256k1/bignum_deamont_p256k1.o \ + secp256k1/bignum_demont_p256k1.o \ + secp256k1/bignum_double_p256k1.o \ + secp256k1/bignum_half_p256k1.o \ + secp256k1/bignum_mod_n256k1_4.o \ + secp256k1/bignum_mod_p256k1_4.o \ + secp256k1/bignum_montmul_p256k1.o \ + secp256k1/bignum_montmul_p256k1_alt.o \ + secp256k1/bignum_montsqr_p256k1.o \ + secp256k1/bignum_montsqr_p256k1_alt.o \ + secp256k1/bignum_mul_p256k1.o \ + secp256k1/bignum_mul_p256k1_alt.o \ + secp256k1/bignum_neg_p256k1.o \ + secp256k1/bignum_optneg_p256k1.o \ + secp256k1/bignum_sqr_p256k1.o \ + secp256k1/bignum_sqr_p256k1_alt.o \ + secp256k1/bignum_sub_p256k1.o \ + secp256k1/bignum_tomont_p256k1.o \ + secp256k1/bignum_triple_p256k1.o \ + sm2/bignum_add_sm2.o \ + sm2/bignum_cmul_sm2.o \ + sm2/bignum_deamont_sm2.o \ + sm2/bignum_demont_sm2.o \ + sm2/bignum_double_sm2.o \ + sm2/bignum_half_sm2.o \ + sm2/bignum_inv_sm2.o \ + sm2/bignum_mod_nsm2.o \ + sm2/bignum_mod_nsm2_4.o \ + sm2/bignum_mod_sm2.o \ + sm2/bignum_mod_sm2_4.o \ + sm2/bignum_montinv_sm2.o \ + sm2/bignum_montmul_sm2.o \ + sm2/bignum_montmul_sm2_alt.o \ + sm2/bignum_montsqr_sm2.o \ + sm2/bignum_montsqr_sm2_alt.o \ + sm2/bignum_neg_sm2.o \ + sm2/bignum_optneg_sm2.o \ + sm2/bignum_sub_sm2.o \ + sm2/bignum_tomont_sm2.o \ + sm2/bignum_triple_sm2.o + +UNOPT_OBJ = p256/unopt/bignum_montmul_p256_base.o \ + p256/unopt/bignum_montsqr_p256_base.o \ + p256/unopt/p256_montjadd.o \ + p256/unopt/p256_montjdouble.o \ + p384/unopt/bignum_montmul_p384_base.o \ + p384/unopt/bignum_montsqr_p384_base.o \ + p384/unopt/p384_montjadd.o \ + p384/unopt/p384_montjdouble.o \ + p521/unopt/bignum_montmul_p521_base.o \ + p521/unopt/bignum_montsqr_p521_base.o \ 
+ p521/unopt/bignum_mul_p521_base.o \ + p521/unopt/bignum_sqr_p521_base.o \ + fastmul/unopt/bignum_emontredc_8n_base.o \ + fastmul/unopt/bignum_emontredc_8n_cdiff_base.o \ + fastmul/unopt/bignum_mul_8_16_base.o \ + fastmul/unopt/bignum_sqr_8_16_base.o + +OBJ = $(POINT_OBJ) $(BIGNUM_OBJ) + +# Tutorial assembly files + +TUTORIAL_PROOFS = $(wildcard tutorial/*.ml) + +TUTORIAL_OBJ = $(TUTORIAL_PROOFS:.ml=.o) tutorial/rel_loop2.o tutorial/rel_simp2.o tutorial/rel_veceq2.o tutorial/rel_equivtac2.o tutorial/rel_reordertac2.o + +# According to +# https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms, +# x18 should not be used for Apple platforms. Check this using grep. + +%.o : %.S + cat $< | $(PREPROCESS) | $(SPLIT) | grep -v -E '^\s+.quad\s+0x[0-9a-f]+$$' | $(ASSEMBLE) -o $@ - + $(OBJDUMP) $@ | ( ( ! grep --ignore-case -E 'w18|[^0]x18' ) || ( rm $@ ; exit 1 ) ) + cat $< | $(PREPROCESS) | $(SPLIT) | $(ASSEMBLE) -o $@ - + +libs2nbignum.a: $(OBJ) ; ar -rc libs2nbignum.a $(OBJ) + +clean:; rm -f libs2nbignum.a */*.o */*/*.o */*.correct */*.native + +# Proof-related parts +# +# The proof files are all independent, though each one loads the +# same common infrastructure "base.ml". So you can potentially +# run the proofs in parallel for more speed, e.g. +# +# nohup make -j 16 proofs & +# +# If you build hol-light yourself (see https://github.com/jrh13/hol-light) +# in your home directory, and do "make" inside the subdirectory hol-light, +# then the following HOLDIR setting should be right: + +HOLDIR?=$(HOME)/hol-light +HOLLIGHT:=$(HOLDIR)/hol.sh + +PROOF_BINS = $(OBJ:.o=.native) +PROOF_LOGS = $(OBJ:.o=.correct) +TUTORIAL_PROOF_BINS = $(TUTORIAL_PROOFS:.ml=.native) +TUTORIAL_PROOF_LOGS = $(TUTORIAL_PROOFS:.ml=.correct) + +# Build precompiled native binaries of HOL Light proofs + +proofs/simulator.native: proofs/simulator.ml ; ../tools/build-proof.sh proofs/simulator.ml "$(HOLLIGHT)" "$@" + +.SECONDEXPANSION: +%.native: proofs/$$(*F).ml %.o ; ../tools/build-proof.sh "$<" "$(HOLLIGHT)" "$@" + +# Run them and print the standard output+error at *.correct + +%.correct: %.native ; ../tools/run-proof.sh "$<" "$@" + +# Cases where a proof uses other proofs for lemmas and/or subroutines + +p256/bignum_montmul_p256.native: p256/unopt/bignum_montmul_p256_base.o +p384/bignum_montmul_p384.native: p384/unopt/bignum_montmul_p384_base.o +p521/bignum_montmul_p521.native: p521/unopt/bignum_montmul_p521_base.o +p256/bignum_montsqr_p256.native: p256/unopt/bignum_montsqr_p256_base.o +p384/bignum_montsqr_p384.native: p384/unopt/bignum_montsqr_p384_base.o +p521/bignum_montsqr_p521.native: p521/unopt/bignum_montsqr_p521_base.o +p521/bignum_mul_p521.native: p521/unopt/bignum_mul_p521_base.o +p521/bignum_sqr_p521.native: p521/unopt/bignum_sqr_p521_base.o +fastmul/bignum_emontredc_8n_cdiff.native: fastmul/unopt/bignum_emontredc_8n_base.o fastmul/unopt/bignum_emontredc_8n_cdiff_base.o +fastmul/bignum_mul_8_16.native: fastmul/unopt/bignum_mul_8_16_base.o +fastmul/bignum_sqr_8_16.native: fastmul/unopt/bignum_sqr_8_16_base.o +curve25519/curve25519_x25519.native: curve25519/bignum_inv_p25519.native +curve25519/curve25519_x25519_alt.native: curve25519/bignum_inv_p25519.native +curve25519/curve25519_x25519_byte.native: curve25519/bignum_inv_p25519.native +curve25519/curve25519_x25519_byte_alt.native: curve25519/bignum_inv_p25519.native +curve25519/curve25519_x25519base.native: curve25519/bignum_inv_p25519.native +curve25519/curve25519_x25519base_alt.native: curve25519/bignum_inv_p25519.native 
+curve25519/curve25519_x25519base_byte.native: curve25519/bignum_inv_p25519.native +curve25519/curve25519_x25519base_byte_alt.native: curve25519/bignum_inv_p25519.native +curve25519/edwards25519_scalarmulbase.native: curve25519/bignum_inv_p25519.native +curve25519/edwards25519_scalarmulbase_alt.native: curve25519/bignum_inv_p25519.native +curve25519/edwards25519_scalarmuldouble.native: curve25519/bignum_inv_p25519.native +curve25519/edwards25519_scalarmuldouble_alt.native: curve25519/bignum_inv_p25519.native +generic/bignum_modexp.native: generic/bignum_amontifier.native generic/bignum_amontmul.native generic/bignum_demont.native generic/bignum_mux.native +p256/p256_montjadd.native: p256/unopt/p256_montjadd.o p256/bignum_montsqr_p256.native p256/bignum_montmul_p256.native p256/bignum_sub_p256.native +p256/p256_montjdouble.native: p256/unopt/p256_montjdouble.o p256/bignum_montsqr_p256.native p256/bignum_montmul_p256.native p256/bignum_sub_p256.native p256/bignum_add_p256.native +p256/p256_montjscalarmul.native: p256/p256_montjadd.native p256/p256_montjdouble.native +p256/p256_montjscalarmul_alt.native: p256/p256_montjadd_alt.native p256/p256_montjdouble_alt.native +p256/p256_scalarmul.native: p256/bignum_demont_p256.native p256/bignum_inv_p256.native p256/bignum_tomont_p256.native p256/p256_montjadd.native p256/p256_montjdouble.native p256/p256_montjmixadd.native +p256/p256_scalarmul_alt.native: p256/bignum_demont_p256.native p256/bignum_inv_p256.native p256/p256_montjadd_alt.native p256/p256_montjdouble_alt.native p256/p256_montjmixadd_alt.native +p256/p256_scalarmulbase.native: p256/bignum_demont_p256.native p256/bignum_inv_p256.native p256/p256_montjmixadd.native +p256/p256_scalarmulbase_alt.native: p256/bignum_demont_p256.native p256/bignum_inv_p256.native p256/p256_montjmixadd_alt.native +p384/p384_montjadd.native: p384/unopt/p384_montjadd.o p384/bignum_montsqr_p384.native p384/bignum_montmul_p384.native p384/bignum_sub_p384.native +p384/p384_montjdouble.native: p384/unopt/p384_montjdouble.o p384/bignum_montsqr_p384.native p384/bignum_montmul_p384.native p384/bignum_sub_p384.native p384/bignum_add_p384.native +p384/p384_montjscalarmul.native: \ + p384/p384_montjadd.native p384/p384_montjdouble.native \ + p384/bignum_sub_p384.native p384/bignum_add_p384.native +p384/p384_montjscalarmul_alt.native: p384/p384_montjadd_alt.native p384/p384_montjdouble_alt.native +p521/p521_jadd.native: p521/bignum_mul_p521.native p521/bignum_sqr_p521.native +p521/p521_jdouble.native: p521/bignum_mul_p521.native p521/bignum_sqr_p521.native +p521/p521_jscalarmul.native: p521/bignum_mod_n521_9.native p521/p521_jadd.native p521/p521_jdouble.native +p521/p521_jscalarmul_alt.native: p521/bignum_mod_n521_9.native +sm2/sm2_montjscalarmul.native: sm2/sm2_montjadd.native sm2/sm2_montjdouble.native +sm2/sm2_montjscalarmul_alt.native: sm2/sm2_montjadd_alt.native sm2/sm2_montjdouble_alt.native + +# Tutorial + +.SECONDEXPANSION: +tutorial/%.native: tutorial/%.ml tutorial/%.o ; ../tools/build-proof.sh "$<" "$(HOLLIGHT)" "$@" +# Additional dependencies on .o files +tutorial/rel_loop.native: tutorial/rel_loop2.o +tutorial/rel_simp.native: tutorial/rel_simp2.o +tutorial/rel_veceq.native: tutorial/rel_veceq2.o +tutorial/rel_equivtac.native: tutorial/rel_equivtac2.o +tutorial/rel_reordertac.native: tutorial/rel_reordertac2.o + + +unopt: $(UNOPT_OBJ) + +build_proofs: $(UNOPT_OBJ) $(PROOF_BINS) +# Conservatively check that there is no redefinition of "check_axioms" +# '-I' excludes binary files (*.native). + ! 
grep -RI "check_axioms" . ../common/ --exclude="Makefile" +build_tutorial: $(TUTORIAL_OBJ) $(TUTORIAL_PROOF_BINS); +run_proofs: build_proofs $(PROOF_LOGS); + +proofs: run_proofs ; ../tools/count-proofs.sh . +tutorial: build_tutorial $(TUTORIAL_PROOF_LOGS); + +# Always run sematest regardless of dependency check +FORCE: ; +# Always use max. # of cores because in Makefile one cannot get the passed number of -j. +# A portable way of getting the number of max. cores: +# https://stackoverflow.com/a/23569003/1488216 +NUM_CORES_FOR_SEMATEST = $(shell getconf _NPROCESSORS_ONLN) +sematest: FORCE $(OBJ) proofs/simulator_iclasses.ml proofs/simulator.native + ../tools/run-sematest.sh arm $(NUM_CORES_FOR_SEMATEST) diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/allowed_asm b/third_party/s2n-bignum/s2n-bignum-imported/arm/allowed_asm new file mode 100644 index 00000000000..e511343b85a --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/allowed_asm @@ -0,0 +1,169 @@ +: adc$ +: adcs$ +: add$ +: adds$ +: adr$ +: and$ +: and.16b$ +: ands$ +: asr$ +: b$ +: bcax$ +: bcax.16b$ +: bfi$ +: bic$ +: bic.8h$ +: bit$ +: bit.16b$ +: bl$ +: ccmn$ +: ccmp$ +: cinc$ +: cinv$ +: clz$ +: cmhi$ +: cmhi.8h$ +: cmn$ +: cmp$ +: cnt$ +: cnt.16b$ +: cneg$ +: csel$ +: cset$ +: csetm$ +: dup$ +: dup.2d$ +: eor$ +: eor3$ +: eor3.16b$ +: ext$ +: ext.16b$ +: extr$ +: fcsel$ +: fmov$ +: ld1r$ +: ld1r.2d$ +: ldp$ +: ldr$ +: ldrb$ +: ldur$ +: lsl$ +: lsr$ +: madd$ +: mls$ +: mls.2s$ +: mls.8h$ +: mneg$ +: mov$ +: mov.d$ +: movi$ +: movi.2d$ +: movk$ +: msub$ +: mul$ +: mul.4s$ +: mvn$ +: neg$ +: negs$ +: ngc$ +: ngcs$ +: orr$ +: rax1$ +: rax1.2d$ +: ret$ +: rev64$ +: rev64.4s$ +: ror$ +: sbc$ +: sbcs$ +: sbfx$ +: shl$ +: shl.2d$ +: shrn$ +: shrn.2s$ +: sli$ +: sli.2d$ +: smlal$ +: smlal.2d$ +: smlal2$ +: smlal2.2d$ +: smlsl$ +: smlsl.2d$ +: smlsl2$ +: smlsl2.2d$ +: smulh$ +: smull$ +: smull.2d$ +: smull2$ +: smull2.2d$ +: sqdmulh$ +: sqdmulh.4s$ +: sqdmulh.8h$ +: sqdmulh.s$ +: sqrdmulh$ +: sqrdmulh.2s$ +: sqrdmulh.4s$ +: sqrdmulh.8h$ +: sri$ +: sri.2d$ +: sri.4h$ +: srshr$ +: srshr.2d$ +: srshr.8h$ +: sshr$ +: sshr.8h$ +: stp$ +: str$ +: strb$ +: stur$ +: sub$ +: subs$ +: trn1$ +: trn1.16b$ +: trn1.2d$ +: trn1.2s$ +: trn1.4s$ +: trn2$ +: trn2.2d$ +: trn2.2s$ +: trn2.4s$ +: tst$ +: uaddlp$ +: uaddlp.2d$ +: uaddlv$ +: uaddlv.8h$ +: ubfx$ +: umaddl$ +: umlal$ +: umlal.2d$ +: umlal2$ +: umlal2.2d$ +: umlsl$ +: umlsl.2d$ +: umlsl2$ +: umlsl2.2d$ +: umulh$ +: umull$ +: umull.2d$ +: umull2$ +: umull2.2d$ +: ushr$ +: ushr.2d$ +: ushr.8h$ +: usra$ +: usra.2d$ +: uzp1$ +: uzp1.4s$ +: uzp2$ +: uzp2.4s$ +: xar$ +: xar.2d$ +: xtn$ +: xtn.2s$ +: zip1$ +: zip1.2s$ +: zip1.4s$ +: zip2$ +: zip2.2s$ +: zip2.4s$ +: $ diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/Makefile b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/Makefile new file mode 100644 index 00000000000..b22696783d4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/Makefile @@ -0,0 +1,77 @@ +############################################################################# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 +############################################################################# + +# If actually on an ARM8 machine, just use the GNU assembler (as). 
Otherwise +# use a cross-assembling version so that the code can still be assembled +# and the proofs checked against the object files (though you won't be able +# to run code without additional emulation infrastructure). The aarch64 +# cross-assembling version can be installed manually by something like: +# +# sudo apt-get install binutils-aarch64-linux-gnu + +UNAME_RESULT=$(shell uname -p) + +ifeq ($(UNAME_RESULT),aarch64) +GAS=as +else +GAS=aarch64-linux-gnu-as +endif + +# List of object files + +OBJ = bignum_add_p25519.o \ + bignum_cmul_p25519.o \ + bignum_double_p25519.o \ + bignum_inv_p25519.o \ + bignum_invsqrt_p25519.o \ + bignum_invsqrt_p25519_alt.o \ + bignum_madd_n25519.o \ + bignum_madd_n25519_alt.o \ + bignum_mod_m25519_4.o \ + bignum_mod_n25519.o \ + bignum_mod_n25519_4.o \ + bignum_mod_p25519_4.o \ + bignum_mul_p25519.o \ + bignum_mul_p25519_alt.o \ + bignum_neg_p25519.o \ + bignum_optneg_p25519.o \ + bignum_sqr_p25519.o \ + bignum_sqr_p25519_alt.o \ + bignum_sqrt_p25519.o \ + bignum_sqrt_p25519_alt.o \ + bignum_sub_p25519.o \ + curve25519_ladderstep.o \ + curve25519_ladderstep_alt.o \ + curve25519_pxscalarmul.o \ + curve25519_pxscalarmul_alt.o \ + curve25519_x25519.o \ + curve25519_x25519_alt.o \ + curve25519_x25519_byte.o \ + curve25519_x25519_byte_alt.o \ + curve25519_x25519base.o \ + curve25519_x25519base_alt.o \ + curve25519_x25519base_byte.o \ + curve25519_x25519base_byte_alt.o \ + edwards25519_decode.o \ + edwards25519_decode_alt.o \ + edwards25519_encode.o \ + edwards25519_epadd.o \ + edwards25519_epadd_alt.o \ + edwards25519_epdouble.o \ + edwards25519_epdouble_alt.o \ + edwards25519_pdouble.o \ + edwards25519_pdouble_alt.o \ + edwards25519_pepadd.o \ + edwards25519_pepadd_alt.o \ + edwards25519_scalarmulbase.o \ + edwards25519_scalarmulbase_alt.o \ + edwards25519_scalarmuldouble.o \ + edwards25519_scalarmuldouble_alt.o + +%.o : %.S ; $(CC) -E -I../../include $< | $(GAS) -o $@ - + +default: $(OBJ); + +clean:; rm -f *.o *.correct diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_add_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_add_p25519.S new file mode 100644 index 00000000000..9a538925f3c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_add_p25519.S @@ -0,0 +1,74 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Add modulo p_25519, z := (x + y) mod p_25519, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_add_p25519 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add_p25519) + + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 +#define d0 x3 +#define d1 x4 +#define d2 x5 +#define d3 x6 +#define c0 x7 +#define c1 x8 +#define c2 x9 +#define c3 x10 + +S2N_BN_SYMBOL(bignum_add_p25519): + +// Add as [d3; d2; d1; d0] = x + y; since we assume x, y < 2^255 - 19 +// this sum fits in 256 bits + + ldp d0, d1, [x] + ldp c0, c1, [y] + adds d0, d0, c0 + adcs d1, d1, c1 + ldp d2, d3, [x, #16] + ldp c0, c1, [y, #16] + adcs d2, d2, c0 + adc d3, d3, c1 + +// Now x + y >= 2^255 - 19 <=> x + y + (2^255 + 19) >= 2^256 +// Form [c3; c2; c1; c0] = (x + y) + (2^255 + 19), with CF for the comparison + + mov c3, #0x8000000000000000 + adds c0, d0, #19 + adcs c1, d1, xzr + adcs c2, d2, xzr + adcs c3, d3, c3 + +// If the comparison holds, select [c3; c2; c1; c0]. There's no need to mask +// it since in this case it is ((x + y) + (2^255 + 19)) - 2^256 because the +// top carry is lost, which is the desired (x + y) - (2^255 - 19). + + csel d0, d0, c0, cc + csel d1, d1, c1, cc + csel d2, d2, c2, cc + csel d3, d3, c3, cc + +// Store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_cmul_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_cmul_p25519.S new file mode 100644 index 00000000000..883007c6c39 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_cmul_p25519.S @@ -0,0 +1,99 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word modulo p_25519, z := (c * x) mod p_25519, assuming +// x reduced +// Inputs c, x[4]; output z[4] +// +// extern void bignum_cmul_p25519 +// (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = c, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p25519) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p25519_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p25519_alt) + .text + .balign 4 + +#define z x0 +#define m x1 +#define x x2 + +#define d0 x3 +#define d1 x4 +#define d2 x5 +#define d3 x6 + +#define a0 x7 +#define a1 x8 +#define a2 x9 +#define a3 x10 + +// Aliased to the a0,..,a3 when they are no longer needed + +#define l x7 +#define q x8 +#define c x9 +#define d4 x10 +#define h x10 + +S2N_BN_SYMBOL(bignum_cmul_p25519): +S2N_BN_SYMBOL(bignum_cmul_p25519_alt): + +// First do the multiply, straightforwardly to get [d4;d3;d2;d1;d0] + + ldp a0, a1, [x] + ldp a2, a3, [x, #16] + mul d0, m, a0 + mul d1, m, a1 + mul d2, m, a2 + mul d3, m, a3 + umulh a0, m, a0 + umulh a1, m, a1 + umulh a2, m, a2 + umulh d4, m, a3 + adds d1, d1, a0 + adcs d2, d2, a1 + adcs d3, d3, a2 + adcs d4, d4, xzr + +// Let 2^255 * h + l = [d4,d3,d2,d1,d0] = c * x, and use q = h + 1 +// as the initial quotient estimate, either right or 1 too big. + + add q, d4, 1 + adds xzr, d3, d3 + orr d3, d3, #0x8000000000000000 + adc q, q, d4 + mov c, #19 + mul l, q, c + umulh h, q, c + adds d0, d0, l + adcs d1, d1, h + adcs d2, d2, xzr + adcs d3, d3, xzr + +// Correct if CF = 0 by subtracting 19, either way masking to +// 255 bits, i.e. by effectively adding p_25519 to the "full" answer + + csel c, c, xzr, cc + subs d0, d0, c + sbcs d1, d1, xzr + sbcs d2, d2, xzr + sbc d3, d3, xzr + and d3, d3, #~0x8000000000000000 + +// Finally store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_double_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_double_p25519.S new file mode 100644 index 00000000000..b2772a56a1f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_double_p25519.S @@ -0,0 +1,71 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Double modulo p_25519, z := (2 * x) mod p_25519, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_double_p25519 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_double_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_double_p25519) + + .text + .balign 4 + +#define z x0 +#define x x1 +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 +#define c0 x6 +#define c1 x7 +#define c2 x8 +#define c3 x9 + +S2N_BN_SYMBOL(bignum_double_p25519): + +// Double by adding as [d3; d2; d1; d0] = 2 * x; since we assume +// x < 2^255 - 19 this result fits in 256 bits + + ldp d0, d1, [x] + adds d0, d0, d0 + adcs d1, d1, d1 + ldp d2, d3, [x, #16] + adcs d2, d2, d2 + adc d3, d3, d3 + +// Now 2 * x >= 2^255 - 19 <=> 2 * x + (2^255 + 19) >= 2^256 +// Form [c3; c2; c1; c0] = (2 * x) + (2^255 + 19), with CF for the comparison + + mov c3, #0x8000000000000000 + adds c0, d0, #19 + adcs c1, d1, xzr + adcs c2, d2, xzr + adcs c3, d3, c3 + +// If the comparison holds, select [c3; c2; c1; c0]. There's no need to mask +// it since in this case it is ((2 * x) + (2^255 + 19)) - 2^256 because the +// top carry is lost, which is the desired (2 * x) - (2^255 - 19). + + csel d0, d0, c0, cc + csel d1, d1, c1, cc + csel d2, d2, c2, cc + csel d3, d3, c3, cc + +// Store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_inv_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_inv_p25519.S new file mode 100644 index 00000000000..d45273a83b0 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_inv_p25519.S @@ -0,0 +1,1255 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Modular inverse modulo p_25519 = 2^255 - 19 +// Input x[4]; output z[4] +// +// extern void bignum_inv_p25519(uint64_t z[static 4],uint64_t x[static 4]); +// +// Assuming the 4-digit input x is coprime to p_25519, i.e. is not divisible +// by it, returns z < p_25519 such that x * z == 1 (mod p_25519). Note that +// x does not need to be reduced modulo p_25519, but the output always is. 
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_inv_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_inv_p25519) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Used for the return pointer + +#define res x20 + +// Loop counter and d = 2 * delta value for divstep + +#define i x21 +#define d x22 + +// Registers used for matrix element magnitudes and signs + +#define m00 x10 +#define m01 x11 +#define m10 x12 +#define m11 x13 +#define s00 x14 +#define s01 x15 +#define s10 x16 +#define s11 x17 + +// Initial carries for combinations + +#define car0 x9 +#define car1 x19 + +// Input and output, plain registers treated according to pattern + +#define reg0 x0, #0 +#define reg1 x1, #0 +#define reg2 x2, #0 +#define reg3 x3, #0 +#define reg4 x4, #0 + +#define x x1, #0 +#define z x0, #0 + +// Pointer-offset pairs for temporaries on stack + +#define f sp, #0 +#define g sp, #(4*N) +#define u sp, #(8*N) +#define v sp, #(12*N) + +// Total size to reserve on the stack + +#define NSPACE #(16*N) + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. +// But different in register usage and returning the final matrix in +// registers as follows +// +// [ m00 m01] +// [ m10 m11] + +#define divstep59() \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg 
x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x8, x4, #0x100, lsl #12 __LF \ + sbfx x8, x8, #21, #21 __LF \ + mov x11, #0x100000 __LF \ + add x11, x11, x11, lsl #21 __LF \ + add x9, x4, x11 __LF \ + asr x9, x9, #42 __LF \ + add x10, x5, #0x100, lsl #12 __LF \ + sbfx x10, x10, #21, #21 __LF \ + add x11, x5, x11 __LF \ + asr x11, x11, #42 __LF \ + mul x6, x8, x2 __LF \ + mul x7, x9, x3 __LF \ + mul x2, x10, x2 __LF \ + mul x3, x11, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 
__LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 
__LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #21, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #42 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, x14, #21, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #42 __LF \ + mul x6, x12, x2 __LF \ + mul x7, x13, x3 __LF \ + mul x2, x14, x2 __LF \ + mul x3, x15, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + 
asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + mul x2, x12, x8 __LF \ + mul x3, x12, x9 __LF \ + mul x6, x14, x8 __LF \ + mul x7, x14, x9 __LF \ + madd x8, x13, x10, x2 __LF \ + madd x9, x13, x11, x3 __LF \ + madd x16, x15, x10, x6 __LF \ + madd x17, x15, x11, x7 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #22, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #43 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, x14, #22, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #43 __LF \ + mneg x2, x12, x8 __LF \ + mneg x3, x12, x9 __LF \ + mneg x4, x14, x8 __LF \ + mneg x5, x14, x9 __LF \ + msub m00, x13, x16, x2 __LF \ + msub m01, x13, x17, x3 __LF \ + msub m10, x15, x16, x4 __LF \ + msub m11, x15, x17, x5 + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + 
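The divstep59 macro above batches 59 elementary division steps, keeping only the low bits of f and g live in registers and handing back the accumulated 2x2 transition matrix in m00/m01/m10/m11. For orientation, one elementary step in the usual Bernstein-Yang ("safegcd") formulation can be modelled in C as below; this is an illustrative reference only, since the assembly works with d = 2*delta, makes the sign tests branch-free with csel/cneg, and composes the 59 per-step matrices on the fly.

#include <stdint.h>

// One textbook divstep on (delta, f, g) with f odd.  The model assumes
// |f| and |g| stay well inside the int64_t range; the real routine acts
// on 256-bit f and g via the matrix returned by each 59-step batch.
static void divstep_model(int64_t *delta, int64_t *f, int64_t *g) {
  if (*delta > 0 && (*g & 1)) {
    // Swap case: both f and g are odd, so g - f is even.
    int64_t t = *g - *f;
    *delta = 1 - *delta;
    *f = *g;
    *g = t / 2;
  } else {
    *delta = 1 + *delta;
    if (*g & 1) {
      *g = (*g + *f) / 2;   // add odd f to make g even, then halve
    } else {
      *g = *g / 2;          // g already even, just halve
    }
  }
}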
+S2N_BN_SYMBOL(bignum_inv_p25519): + +// Save registers and make room for temporaries + + stp x19, x20, [sp, -16]! + stp x21, x22, [sp, -16]! + sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Copy the input and the prime into the main f and g variables. +// Make sure x is reduced so that g <= f as assumed in the bound proof. + + mov x10, #-19 + mov x11, #-1 + stp x10, x11, [f] + mov x12, #0x7FFFFFFFFFFFFFFF + stp x11, x12, [f+2*N] + + ldp x2, x3, [x1] + ldp x4, x5, [x1, #(2*N)] + mov x7, #19 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [g] + stp x4, x5, [g+2*N] + +// Also maintain weakly reduced < 2*p_25519 vector [u,v] such that +// [f,g] == x * 2^{590-59*i} * [u,v] (mod p_25519) +// starting with [p_25519,x] == x * 2^{590-59*0} * [0,2^-590] (mod p_25519) + + stp xzr, xzr, [u] + stp xzr, xzr, [u+2*N] + + movbig(x10, 0xa0f9, 0x9e23, 0x7502, 0x2099) + movbig(x11, 0xa8c6, 0x8f3f, 0x1d13, 0x2595) + movbig(x12, 0x6c6c, 0x8938, 0x05ac, 0x5242) + movbig(x13, 0x2765, 0x08b2, 0x4177, 0x0615) + + stp x10, x11, [v] + stp x12, x13, [v+2*N] + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special tenth iteration after a uniform +// first 9. + + mov i, #10 + mov d, #1 + b bignum_inv_p25519_midloop + +bignum_inv_p25519_loop: + +// Separate the matrix elements into sign-magnitude pairs + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in stable registers for the [u,v] part and do [f,g] first. + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + + and x0, m10, s10 + and x1, m11, s11 + add car1, x0, x1 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. 
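The eor/mul/umulh pattern used for every digit below relies on complementing instead of negating: for a sign mask s (all zeros or all ones) and magnitude m, summing m * (digit XOR s) over the digits of a value X, with the accumulator seeded by (m AND s), yields m*X or -m*X modulo the accumulator width, because -X = ~X + 1. (The top digit additionally needs sign extension, which is what the separate treatment of digits 3 and 4 handles.) A small self-contained C check of that identity on a two-digit example, with arbitrary illustrative values:

#include <assert.h>
#include <stdint.h>

int main(void) {
  uint64_t x0 = 0x0123456789abcdefULL, x1 = 0xfedcba9876543210ULL;
  uint64_t m  = 0x00000007ffffffedULL;          // arbitrary magnitude
  unsigned __int128 X = ((unsigned __int128)x1 << 64) | x0;

  for (int neg = 0; neg <= 1; neg++) {
    uint64_t s = neg ? ~(uint64_t)0 : 0;        // sign mask

    // Seed with (m & s), then add m * (digit ^ s) at each digit position.
    unsigned __int128 acc = m & s;
    acc += (unsigned __int128)m * (x0 ^ s);
    acc += ((unsigned __int128)m * (x1 ^ s)) << 64;

    unsigned __int128 want = (unsigned __int128)m * X;
    if (neg) {
      want = (unsigned __int128)0 - want;       // -m*X mod 2^128
    }
    assert(acc == want);
  }
  return 0;
}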
+// +// Digit 0 of [f,g] + + ldr x7, [f] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [g] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x3, x3, x1 + +// Digit 1 of [f,g] + + ldr x7, [f+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [g+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [f] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [g] + +// Digit 2 of [f,g] + + ldr x7, [f+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [g+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [f+N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [g+N] + +// Digits 3 and 4 of [f,g] + + ldr x7, [f+3*N] + eor x1, x7, s00 + asr x3, x1, #63 + and x3, x3, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [g+3*N] + eor x1, x8, s01 + asr x0, x1, #63 + and x0, x0, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [f+2*N] + extr x5, x3, x5, #59 + str x5, [f+3*N] + + eor x1, x7, s10 + asr x5, x1, #63 + and x5, x5, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + asr x0, x1, #63 + and x0, x0, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [g+2*N] + extr x2, x5, x2, #59 + str x2, [g+3*N] + +// Now the computation of the updated u and v values and their +// modular reductions. A very similar accumulation except that +// the top words of u and v are unsigned and we don't shift. 
+// +// Digit 0 of [u,v] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v] + adc x3, x3, x1 + +// Digit 1 of [u,v] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + str x3, [v+N] + adc x4, x4, x1 + +// Digit 2 of [u,v] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + str x4, [v+2*N] + adc x2, x2, x1 + +// Digits 3 and 4 of u (top is unsigned) + + ldr x7, [u+3*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + +// Modular reduction of u + + extr x6, x3, x5, #63 + ldp x0, x1, [u] + add x6, x6, x3, asr #63 + mov x3, #19 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [u+2*N] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [u] + stp x6, x5, [u+16] + +// Digits 3 and 4 of v (top is unsigned) + + eor x1, x7, s10 + and x5, s10, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + and x0, s11, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + +// Modular reduction of v + + extr x6, x5, x2, #63 + ldp x0, x1, [v] + add x6, x6, x5, asr #63 + mov x5, #19 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [v+2*N] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [v] + stp x3, x2, [v+16] + +bignum_inv_p25519_midloop: + + mov x1, d + ldr x2, [f] + ldr x3, [g] + divstep59() + mov d, x1 + +// Next iteration + + subs i, i, #1 + bne bignum_inv_p25519_loop + +// The 10th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + ldr x0, [f] + ldr x1, [g] + mul x0, x0, m00 + madd x1, x1, m01, x0 + asr x0, x1, #63 + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. 
+// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * [u,v] (mod p_25519) +// we want to flip the sign of u according to that of f. + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + eor s00, s00, x0 + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + eor s01, s01, x0 + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + eor s10, s10, x0 + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + eor s11, s11, x0 + +// Adjust the initial value to allow for complement instead of negation + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + +// Digit 0 of [u] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + +// Digit 1 of [u] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + +// Digit 2 of [u] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + +// Digits 3 and 4 of u (top is unsigned) + + ldr x7, [u+3*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + +// Modular reduction of u, this time strictly 2^255-19. + + extr x6, x3, x5, #63 + ldp x0, x1, [u] + tst x3, x3 + cinc x6, x6, pl + mov x3, #19 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [u+2*N] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + +// Store it back to the final output + + mov x4, res + stp x0, x1, [x4] + stp x2, x5, [x4, #16] + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_invsqrt_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_invsqrt_p25519.S new file mode 100644 index 00000000000..50d774d1f58 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_invsqrt_p25519.S @@ -0,0 +1,600 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Inverse square root modulo p_25519 = 2^255 - 19 +// Input x[4]; output function return (Legendre symbol) and z[4] +// +// extern int64_t bignum_invsqrt_p25519(uint64_t z[static 4],uint64_t x[static 4]); +// +// Given a 4-digit input x, returns a modular inverse square root mod p_25519, +// i.e. a z such that x * z^2 == 1 (mod p_25519), whenever one exists. The +// inverse square root z is chosen so that its LSB is even (note that p_25519-z +// is another possibility). 
The function return is the Legendre/Jacobi symbol +// (x//p_25519), which indicates whether indeed x has a modular inverse square +// root and hence whether the result is meaningful: +// +// 0: x is divisible by p_25519 so trivially there is no inverse square root +// +1: x is coprime to p_25519 and z is indeed an inverse square root +// -1: x is coprime to p_25519 but there is no (inverse or direct) square root +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_invsqrt_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_invsqrt_p25519) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define a sp, #0 +#define b sp, #(4*N) +#define s sp, #(8*N) +#define t sp, #(12*N) + +// Other temporary variables in register + +#define res x19 + +// Total size to reserve on the stack + +#define NSPACE #(16*N) + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +// Macros wrapping up calls to the local subroutines + +#define mulp(dest,src1,src2) \ + add x0, dest __LF \ + add x1, src1 __LF \ + add x2, src2 __LF \ + bl bignum_invsqrt_p25519_mul_p25519 + +#define nsqr(dest,n,src) \ + add x0, dest __LF \ + mov x1, n __LF \ + add x2, src __LF \ + bl bignum_invsqrt_p25519_nsqr_p25519 + +S2N_BN_SYMBOL(bignum_invsqrt_p25519): + +// Save registers and make room for temporaries + + stp x19, x30, [sp, -16]! + sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Set up reduced version of the input argument a = x mod p_25519. Then +// get the candidate inverse square root s = a^{252-3} + + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #19 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, lo + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [a] + stp x4, x5, [a+16] + + // Power 2^2 - 1 = 3 + + nsqr(t,1,a) + mulp(t,t,a) + + // Power 2^4 - 1 = 15 + + nsqr(s,2,t) + mulp(t,s,t) + + // Power 2^5 - 1 = 31 + + nsqr(s,1,t) + mulp(b,s,a) + + // Power 2^10 - 1 + + nsqr(s,5,b) + mulp(t,s,b) + + // Power 2^20 - 1 + + nsqr(s,10,t) + mulp(t,s,t) + + // Power 2^25 - 1 + + nsqr(s,5,t) + mulp(b,s,b) + + // Power 2^50 - 1 + + nsqr(s,25,b) + mulp(t,s,b) + + // Power 2^100 - 1 + nsqr(s,50,t) + mulp(t,s,t) + + // Power 2^125 - 1 + + nsqr(s,25,t) + mulp(b,s,b) + + // Power 2^250 - 1 + + nsqr(s,125,b) + mulp(b,s,b) + + // Power 2^252 - 3 + + nsqr(s,2,b) + mulp(s,s,a) + +// s = a^{2^252-3} is now one candidate inverse square root. 
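The nsqr/mulp ladder above is a fixed addition chain for the exponent 2^252 - 3 = (p_25519 - 5)/8, costing 251 squarings and 11 multiplications. One way to sanity-check the chain's shape is to replay the same squaring/multiplication pattern in a small prime field and compare against direct exponentiation; the C check below does that over an arbitrary 61-bit prime (the modulus, base and helper names are illustrative and unrelated to this module).

#include <assert.h>
#include <stdint.h>

static const uint64_t Q = 2305843009213693951ULL;   // 2^61 - 1, prime

static uint64_t mulmod(uint64_t a, uint64_t b) {
  return (uint64_t)(((unsigned __int128)a * b) % Q);
}

static uint64_t powmod(uint64_t a, uint64_t e, uint64_t m) {
  uint64_t r = 1;
  a %= m;
  while (e) {
    if (e & 1) r = (uint64_t)(((unsigned __int128)r * a) % m);
    a = (uint64_t)(((unsigned __int128)a * a) % m);
    e >>= 1;
  }
  return r;
}

static uint64_t nsqr(uint64_t v, int n) {    // v^(2^n) mod Q
  while (n-- > 0) v = mulmod(v, v);
  return v;
}

int main(void) {
  uint64_t a = 3, t, s, b;

  t = nsqr(a, 1);   t = mulmod(t, a);   // a^(2^2 - 1)
  s = nsqr(t, 2);   t = mulmod(s, t);   // a^(2^4 - 1)
  s = nsqr(t, 1);   b = mulmod(s, a);   // a^(2^5 - 1)
  s = nsqr(b, 5);   t = mulmod(s, b);   // a^(2^10 - 1)
  s = nsqr(t, 10);  t = mulmod(s, t);   // a^(2^20 - 1)
  s = nsqr(t, 5);   b = mulmod(s, b);   // a^(2^25 - 1)
  s = nsqr(b, 25);  t = mulmod(s, b);   // a^(2^50 - 1)
  s = nsqr(t, 50);  t = mulmod(s, t);   // a^(2^100 - 1)
  s = nsqr(t, 25);  b = mulmod(s, b);   // a^(2^125 - 1)
  s = nsqr(b, 125); b = mulmod(s, b);   // a^(2^250 - 1)
  s = nsqr(b, 2);   s = mulmod(s, a);   // a^(2^252 - 3)

  // Direct exponent, reduced mod Q - 1 (valid since gcd(a, Q) = 1).
  uint64_t e = powmod(2, 252, Q - 1);
  e = (e + (Q - 1) - 3) % (Q - 1);
  assert(s == powmod(a, e, Q));
  return 0;
}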
+// Generate the other one t = s * j_25519 where j_25519 = sqrt(-1) + + movbig(x0, #0xc4ee, #0x1b27, #0x4a0e, #0xa0b0) + movbig(x1, #0x2f43, #0x1806, #0xad2f, #0xe478) + movbig(x2, #0x2b4d, #0x0099, #0x3dfb, #0xd7a7) + movbig(x3, #0x2b83, #0x2480, #0x4fc1, #0xdf0b) + stp x0, x1, [t] + stp x2, x3, [t+16] + mulp(t,s,t) + +// Now multiplex between them according to whether a * s^2 = 1 + + nsqr(b,1,s) + mulp(b,a,b) + + ldp x10, x11, [b] + eor x10, x10, #1 + ldp x12, x13, [b+16] + orr x10, x10, x11 + orr x12, x12, x13 + orr x10, x10, x12 + cmp x10, xzr + + ldp x10, x11, [s] + ldp x14, x15, [t] + csel x10, x10, x14, eq + csel x11, x11, x15, eq + ldp x12, x13, [s+16] + ldp x16, x17, [t+16] + csel x12, x12, x16, eq + csel x13, x13, x17, eq + +// For definiteness, choose "positive" (LSB=0) inverse square root + + mov x14, #-19 + subs x14, x14, x10 + mov x16, #-1 + sbcs x15, x16, x11 + mov x17, #0x7FFFFFFFFFFFFFFF + sbcs x16, x16, x12 + sbc x17, x17, x13 + + tst x10, #1 + csel x10, x10, x14, eq + csel x11, x11, x15, eq + csel x12, x12, x16, eq + csel x13, x13, x17, eq + + mov x2, res + stp x10, x11, [x2] + stp x12, x13, [x2, #16] + +// Determine if it is is indeed an inverse square root, also distinguishing +// the degenerate x * z^2 == 0 (mod p_25519) case, which is equivalent to +// x == 0 (mod p_25519). Hence return the Legendre-Jacobi symbol as required. + + add x0, b + mov x1, #1 + bl bignum_invsqrt_p25519_nsqr_p25519 + mulp(b,a,b) + + ldp x10, x11, [b] + eor x14, x10, #1 + ldp x12, x13, [b+16] + orr x10, x10, x11 + orr x14, x14, x11 + orr x12, x12, x13 + orr x10, x10, x12 + orr x14, x14, x12 + + cmp x14, xzr + mov x0, #1 + cneg x0, x0, ne + + cmp x10, xzr + csel x0, x0, xzr, ne + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x19, x30, [sp], 16 + ret + +// ************************************************************* +// Local z = x * y +// ************************************************************* + +bignum_invsqrt_p25519_mul_p25519: + ldp x3, x4, [x1] + ldp x5, x6, [x2] + umull x7, w3, w5 + lsr x17, x3, #32 + umull x15, w17, w5 + lsr x16, x5, #32 + umull x8, w16, w17 + umull x16, w3, w16 + adds x7, x7, x15, lsl #32 + lsr x15, x15, #32 + adc x8, x8, x15 + adds x7, x7, x16, lsl #32 + lsr x16, x16, #32 + adc x8, x8, x16 + mul x9, x4, x6 + umulh x10, x4, x6 + subs x4, x4, x3 + cneg x4, x4, lo + csetm x16, lo + adds x9, x9, x8 + adc x10, x10, xzr + subs x3, x5, x6 + cneg x3, x3, lo + cinv x16, x16, lo + mul x15, x4, x3 + umulh x3, x4, x3 + adds x8, x7, x9 + adcs x9, x9, x10 + adc x10, x10, xzr + cmn x16, #1 + eor x15, x15, x16 + adcs x8, x15, x8 + eor x3, x3, x16 + adcs x9, x3, x9 + adc x10, x10, x16 + ldp x3, x4, [x1, #16] + ldp x5, x6, [x2, #16] + umull x11, w3, w5 + lsr x17, x3, #32 + umull x15, w17, w5 + lsr x16, x5, #32 + umull x12, w16, w17 + umull x16, w3, w16 + adds x11, x11, x15, lsl #32 + lsr x15, x15, #32 + adc x12, x12, x15 + adds x11, x11, x16, lsl #32 + lsr x16, x16, #32 + adc x12, x12, x16 + mul x13, x4, x6 + umulh x14, x4, x6 + subs x4, x4, x3 + cneg x4, x4, lo + csetm x16, lo + adds x13, x13, x12 + adc x14, x14, xzr + subs x3, x5, x6 + cneg x3, x3, lo + cinv x16, x16, lo + mul x15, x4, x3 + umulh x3, x4, x3 + adds x12, x11, x13 + adcs x13, x13, x14 + adc x14, x14, xzr + cmn x16, #1 + eor x15, x15, x16 + adcs x12, x15, x12 + eor x3, x3, x16 + adcs x13, x3, x13 + adc x14, x14, x16 + ldp x3, x4, [x1, #16] + ldp x15, x16, [x1] + subs x3, x3, x15 + sbcs x4, x4, x16 + csetm x16, lo + ldp x15, x17, [x2] + subs x5, x15, x5 + sbcs x6, x17, x6 + csetm x17, lo + eor x3, x3, 
x16 + subs x3, x3, x16 + eor x4, x4, x16 + sbc x4, x4, x16 + eor x5, x5, x17 + subs x5, x5, x17 + eor x6, x6, x17 + sbc x6, x6, x17 + eor x16, x17, x16 + adds x11, x11, x9 + adcs x12, x12, x10 + adcs x13, x13, xzr + adc x14, x14, xzr + mul x2, x3, x5 + umulh x17, x3, x5 + mul x15, x4, x6 + umulh x1, x4, x6 + subs x4, x4, x3 + cneg x4, x4, lo + csetm x9, lo + adds x15, x15, x17 + adc x1, x1, xzr + subs x6, x5, x6 + cneg x6, x6, lo + cinv x9, x9, lo + mul x5, x4, x6 + umulh x6, x4, x6 + adds x17, x2, x15 + adcs x15, x15, x1 + adc x1, x1, xzr + cmn x9, #1 + eor x5, x5, x9 + adcs x17, x5, x17 + eor x6, x6, x9 + adcs x15, x6, x15 + adc x1, x1, x9 + adds x9, x11, x7 + adcs x10, x12, x8 + adcs x11, x13, x11 + adcs x12, x14, x12 + adcs x13, x13, xzr + adc x14, x14, xzr + cmn x16, #1 + eor x2, x2, x16 + adcs x9, x2, x9 + eor x17, x17, x16 + adcs x10, x17, x10 + eor x15, x15, x16 + adcs x11, x15, x11 + eor x1, x1, x16 + adcs x12, x1, x12 + adcs x13, x13, x16 + adc x14, x14, x16 + mov x3, #38 + umull x4, w11, w3 + add x4, x4, w7, uxtw + lsr x7, x7, #32 + lsr x11, x11, #32 + umaddl x11, w11, w3, x7 + mov x7, x4 + umull x4, w12, w3 + add x4, x4, w8, uxtw + lsr x8, x8, #32 + lsr x12, x12, #32 + umaddl x12, w12, w3, x8 + mov x8, x4 + umull x4, w13, w3 + add x4, x4, w9, uxtw + lsr x9, x9, #32 + lsr x13, x13, #32 + umaddl x13, w13, w3, x9 + mov x9, x4 + umull x4, w14, w3 + add x4, x4, w10, uxtw + lsr x10, x10, #32 + lsr x14, x14, #32 + umaddl x14, w14, w3, x10 + mov x10, x4 + lsr x17, x14, #31 + mov x5, #19 + umaddl x5, w5, w17, x5 + add x7, x7, x5 + adds x7, x7, x11, lsl #32 + extr x3, x12, x11, #32 + adcs x8, x8, x3 + extr x3, x13, x12, #32 + adcs x9, x9, x3 + extr x3, x14, x13, #32 + lsl x5, x17, #63 + eor x10, x10, x5 + adc x10, x10, x3 + mov x3, #19 + tst x10, #0x8000000000000000 + csel x3, x3, xzr, pl + subs x7, x7, x3 + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbc x10, x10, xzr + and x10, x10, #0x7fffffffffffffff + stp x7, x8, [x0] + stp x9, x10, [x0, #16] + ret + +// ************************************************************* +// Local z = 2^n * x +// ************************************************************* + +bignum_invsqrt_p25519_nsqr_p25519: + +// Copy input argument into [x13;x12;x11;x10] + + ldp x10, x11, [x2] + ldp x12, x13, [x2, #16] + +// Main squaring loop, accumulating in [x13;x12;x11;x10] consistently and +// only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38 + +bignum_invsqrt_p25519_loop: + umull x2, w10, w10 + lsr x14, x10, #32 + umull x3, w14, w14 + umull x14, w10, w14 + adds x2, x2, x14, lsl #33 + lsr x14, x14, #31 + adc x3, x3, x14 + umull x4, w11, w11 + lsr x14, x11, #32 + umull x5, w14, w14 + umull x14, w11, w14 + mul x15, x10, x11 + umulh x16, x10, x11 + adds x4, x4, x14, lsl #33 + lsr x14, x14, #31 + adc x5, x5, x14 + adds x15, x15, x15 + adcs x16, x16, x16 + adc x5, x5, xzr + adds x3, x3, x15 + adcs x4, x4, x16 + adc x5, x5, xzr + umull x6, w12, w12 + lsr x14, x12, #32 + umull x7, w14, w14 + umull x14, w12, w14 + adds x6, x6, x14, lsl #33 + lsr x14, x14, #31 + adc x7, x7, x14 + umull x8, w13, w13 + lsr x14, x13, #32 + umull x9, w14, w14 + umull x14, w13, w14 + mul x15, x12, x13 + umulh x16, x12, x13 + adds x8, x8, x14, lsl #33 + lsr x14, x14, #31 + adc x9, x9, x14 + adds x15, x15, x15 + adcs x16, x16, x16 + adc x9, x9, xzr + adds x7, x7, x15 + adcs x8, x8, x16 + adc x9, x9, xzr + subs x10, x10, x12 + sbcs x11, x11, x13 + csetm x16, lo + eor x10, x10, x16 + subs x10, x10, x16 + eor x11, x11, x16 + sbc x11, x11, x16 + adds x6, x6, x4 + adcs x7, x7, x5 + adcs x8, 
x8, xzr + adc x9, x9, xzr + umull x12, w10, w10 + lsr x5, x10, #32 + umull x13, w5, w5 + umull x5, w10, w5 + adds x12, x12, x5, lsl #33 + lsr x5, x5, #31 + adc x13, x13, x5 + umull x15, w11, w11 + lsr x5, x11, #32 + umull x14, w5, w5 + umull x5, w11, w5 + mul x4, x10, x11 + umulh x16, x10, x11 + adds x15, x15, x5, lsl #33 + lsr x5, x5, #31 + adc x14, x14, x5 + adds x4, x4, x4 + adcs x16, x16, x16 + adc x14, x14, xzr + adds x13, x13, x4 + adcs x15, x15, x16 + adc x14, x14, xzr + adds x4, x2, x6 + adcs x5, x3, x7 + adcs x6, x6, x8 + adcs x7, x7, x9 + csetm x16, lo + subs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x15 + sbcs x7, x7, x14 + adcs x8, x8, x16 + adc x9, x9, x16 + mov x10, #38 + umull x12, w6, w10 + add x12, x12, w2, uxtw + lsr x2, x2, #32 + lsr x6, x6, #32 + umaddl x6, w6, w10, x2 + mov x2, x12 + umull x12, w7, w10 + add x12, x12, w3, uxtw + lsr x3, x3, #32 + lsr x7, x7, #32 + umaddl x7, w7, w10, x3 + mov x3, x12 + umull x12, w8, w10 + add x12, x12, w4, uxtw + lsr x4, x4, #32 + lsr x8, x8, #32 + umaddl x8, w8, w10, x4 + mov x4, x12 + umull x12, w9, w10 + add x12, x12, w5, uxtw + lsr x5, x5, #32 + lsr x9, x9, #32 + umaddl x9, w9, w10, x5 + mov x5, x12 + lsr x13, x9, #31 + mov x11, #19 + umull x11, w11, w13 + add x2, x2, x11 + adds x10, x2, x6, lsl #32 + extr x12, x7, x6, #32 + adcs x11, x3, x12 + extr x12, x8, x7, #32 + adcs x12, x4, x12 + extr x14, x9, x8, #32 + lsl x15, x13, #63 + eor x5, x5, x15 + adc x13, x5, x14 + +// Loop as applicable + + subs x1, x1, #1 + bne bignum_invsqrt_p25519_loop + +// We know the intermediate result x < 2^256 - 38, and now we do strict +// modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255 +// which is equivalent to a "pl" condition. + + adds x6, x10, #19 + adcs x7, x11, xzr + adcs x8, x12, xzr + adcs x9, x13, xzr + + csel x10, x10, x6, pl + csel x11, x11, x7, pl + csel x12, x12, x8, pl + csel x13, x13, x9, pl + bic x13, x13, #0x8000000000000000 + +// Copy result back into destination and return + + stp x10, x11, [x0] + stp x12, x13, [x0, #16] + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_invsqrt_p25519_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_invsqrt_p25519_alt.S new file mode 100644 index 00000000000..ad05cdffe18 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_invsqrt_p25519_alt.S @@ -0,0 +1,463 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Inverse square root modulo p_25519 = 2^255 - 19 +// Input x[4]; output function return (Legendre symbol) and z[4] +// +// extern int64_t bignum_invsqrt_p25519_alt(uint64_t z[static 4],uint64_t x[static 4]); +// +// Given a 4-digit input x, returns a modular inverse square root mod p_25519, +// i.e. a z such that x * z^2 == 1 (mod p_25519), whenever one exists. The +// inverse square root z is chosen so that its LSB is even (note that p_25519-z +// is another possibility). 
The function return is the Legendre/Jacobi symbol +// (x//p_25519), which indicates whether indeed x has a modular inverse square +// root and hence whether the result is meaningful: +// +// 0: x is divisible by p_25519 so trivially there is no inverse square root +// +1: x is coprime to p_25519 and z is indeed an inverse square root +// -1: x is coprime to p_25519 but there is no (inverse or direct) square root +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_invsqrt_p25519_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_invsqrt_p25519_alt) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define a sp, #0 +#define b sp, #(4*N) +#define s sp, #(8*N) +#define t sp, #(12*N) + +// Other temporary variables in register + +#define res x19 + +// Total size to reserve on the stack + +#define NSPACE #(16*N) + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +// Macros wrapping up calls to the local subroutines + +#define mulp(dest,src1,src2) \ + add x0, dest __LF \ + add x1, src1 __LF \ + add x2, src2 __LF \ + bl bignum_invsqrt_p25519_alt_mul_p25519 + +#define nsqr(dest,n,src) \ + add x0, dest __LF \ + mov x1, n __LF \ + add x2, src __LF \ + bl bignum_invsqrt_p25519_alt_nsqr_p25519 + +S2N_BN_SYMBOL(bignum_invsqrt_p25519_alt): + +// Save registers and make room for temporaries + + stp x19, x30, [sp, -16]! + sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Set up reduced version of the input argument a = x mod p_25519. Then +// get the candidate inverse square root s = a^{252-3} + + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #19 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, lo + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [a] + stp x4, x5, [a+16] + + // Power 2^2 - 1 = 3 + + nsqr(t,1,a) + mulp(t,t,a) + + // Power 2^4 - 1 = 15 + + nsqr(s,2,t) + mulp(t,s,t) + + // Power 2^5 - 1 = 31 + + nsqr(s,1,t) + mulp(b,s,a) + + // Power 2^10 - 1 + + nsqr(s,5,b) + mulp(t,s,b) + + // Power 2^20 - 1 + + nsqr(s,10,t) + mulp(t,s,t) + + // Power 2^25 - 1 + + nsqr(s,5,t) + mulp(b,s,b) + + // Power 2^50 - 1 + + nsqr(s,25,b) + mulp(t,s,b) + + // Power 2^100 - 1 + nsqr(s,50,t) + mulp(t,s,t) + + // Power 2^125 - 1 + + nsqr(s,25,t) + mulp(b,s,b) + + // Power 2^250 - 1 + + nsqr(s,125,b) + mulp(b,s,b) + + // Power 2^252 - 3 + + nsqr(s,2,b) + mulp(s,s,a) + +// s = a^{2^252-3} is now one candidate inverse square root. 
+// Generate the other one t = s * j_25519 where j_25519 = sqrt(-1) + + movbig(x0, #0xc4ee, #0x1b27, #0x4a0e, #0xa0b0) + movbig(x1, #0x2f43, #0x1806, #0xad2f, #0xe478) + movbig(x2, #0x2b4d, #0x0099, #0x3dfb, #0xd7a7) + movbig(x3, #0x2b83, #0x2480, #0x4fc1, #0xdf0b) + stp x0, x1, [t] + stp x2, x3, [t+16] + mulp(t,s,t) + +// Now multiplex between them according to whether a * s^2 = 1 + + nsqr(b,1,s) + mulp(b,a,b) + + ldp x10, x11, [b] + eor x10, x10, #1 + ldp x12, x13, [b+16] + orr x10, x10, x11 + orr x12, x12, x13 + orr x10, x10, x12 + cmp x10, xzr + + ldp x10, x11, [s] + ldp x14, x15, [t] + csel x10, x10, x14, eq + csel x11, x11, x15, eq + ldp x12, x13, [s+16] + ldp x16, x17, [t+16] + csel x12, x12, x16, eq + csel x13, x13, x17, eq + +// For definiteness, choose "positive" (LSB=0) inverse square root + + mov x14, #-19 + subs x14, x14, x10 + mov x16, #-1 + sbcs x15, x16, x11 + mov x17, #0x7FFFFFFFFFFFFFFF + sbcs x16, x16, x12 + sbc x17, x17, x13 + + tst x10, #1 + csel x10, x10, x14, eq + csel x11, x11, x15, eq + csel x12, x12, x16, eq + csel x13, x13, x17, eq + + mov x2, res + stp x10, x11, [x2] + stp x12, x13, [x2, #16] + +// Determine if it is is indeed an inverse square root, also distinguishing +// the degenerate x * z^2 == 0 (mod p_25519) case, which is equivalent to +// x == 0 (mod p_25519). Hence return the Legendre-Jacobi symbol as required. + + add x0, b + mov x1, #1 + bl bignum_invsqrt_p25519_alt_nsqr_p25519 + mulp(b,a,b) + + ldp x10, x11, [b] + eor x14, x10, #1 + ldp x12, x13, [b+16] + orr x10, x10, x11 + orr x14, x14, x11 + orr x12, x12, x13 + orr x10, x10, x12 + orr x14, x14, x12 + + cmp x14, xzr + mov x0, #1 + cneg x0, x0, ne + + cmp x10, xzr + csel x0, x0, xzr, ne + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x19, x30, [sp], 16 + ret + +// ************************************************************* +// Local z = x * y +// ************************************************************* + +bignum_invsqrt_p25519_alt_mul_p25519: + ldp x3, x4, [x1] + ldp x7, x8, [x2] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x2, #16] + mul x11, x3, x9 + umulh x15, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x16, x3, x10 + adcs x15, x15, x11 + adc x16, x16, xzr + ldp x5, x6, [x1, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x15, x15, x11 + mul x11, x4, x10 + adcs x16, x16, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x15, x15, x11 + umulh x11, x4, x9 + adcs x16, x16, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x15, x15, x11 + mul x11, x5, x9 + adcs x16, x16, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x15, x15, x11 + umulh x11, x5, x8 + adcs x16, x16, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x15, x15, x11 + mul x11, x6, x8 + adcs x16, x16, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x16, x16, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + mov x7, #38 + mul x11, x7, x16 + umulh x9, x7, x16 + adds x12, x12, x11 + mul x11, x7, x3 + umulh x3, x7, x3 + adcs x13, x13, x11 + mul x11, x7, x4 + umulh x4, x7, x4 + adcs x14, x14, x11 + mul x11, x7, x5 + umulh x5, x7, x5 + adcs x15, x15, 
x11 + cset x16, hs + adds x15, x15, x4 + adc x16, x16, x5 + cmn x15, x15 + orr x15, x15, #0x8000000000000000 + adc x8, x16, x16 + mov x7, #19 + madd x11, x7, x8, x7 + adds x12, x12, x11 + adcs x13, x13, x9 + adcs x14, x14, x3 + adcs x15, x15, xzr + csel x7, x7, xzr, lo + subs x12, x12, x7 + sbcs x13, x13, xzr + sbcs x14, x14, xzr + sbc x15, x15, xzr + and x15, x15, #0x7fffffffffffffff + stp x12, x13, [x0] + stp x14, x15, [x0, #16] + ret + +// ************************************************************* +// Local z = 2^n * x +// ************************************************************* + +bignum_invsqrt_p25519_alt_nsqr_p25519: + +// Copy input argument into [x5;x4;x3;x2] (overwriting input pointer x20 + + ldp x6, x3, [x2] + ldp x4, x5, [x2, #16] + mov x2, x6 + +// Main squaring loop, accumulating in [x5;x4;x3;x2] consistently and +// only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38 + +bignum_invsqrt_p25519_alt_loop: + mul x9, x2, x3 + umulh x10, x2, x3 + mul x11, x2, x5 + umulh x12, x2, x5 + mul x7, x2, x4 + umulh x6, x2, x4 + adds x10, x10, x7 + adcs x11, x11, x6 + mul x7, x3, x4 + umulh x6, x3, x4 + adc x6, x6, xzr + adds x11, x11, x7 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x6 + mul x7, x3, x5 + umulh x6, x3, x5 + adc x6, x6, xzr + adds x12, x12, x7 + adcs x13, x13, x6 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x6, hs + umulh x7, x2, x2 + mul x8, x2, x2 + adds x9, x9, x7 + mul x7, x3, x3 + adcs x10, x10, x7 + umulh x7, x3, x3 + adcs x11, x11, x7 + mul x7, x4, x4 + adcs x12, x12, x7 + umulh x7, x4, x4 + adcs x13, x13, x7 + mul x7, x5, x5 + adcs x14, x14, x7 + umulh x7, x5, x5 + adc x6, x6, x7 + mov x3, #38 + mul x7, x3, x12 + umulh x4, x3, x12 + adds x8, x8, x7 + mul x7, x3, x13 + umulh x13, x3, x13 + adcs x9, x9, x7 + mul x7, x3, x14 + umulh x14, x3, x14 + adcs x10, x10, x7 + mul x7, x3, x6 + umulh x6, x3, x6 + adcs x11, x11, x7 + cset x12, hs + adds x11, x11, x14 + adc x12, x12, x6 + cmn x11, x11 + bic x11, x11, #0x8000000000000000 + adc x2, x12, x12 + mov x3, #0x13 + mul x7, x3, x2 + adds x2, x8, x7 + adcs x3, x9, x4 + adcs x4, x10, x13 + adc x5, x11, xzr + +// Loop as applicable + + subs x1, x1, #1 + bne bignum_invsqrt_p25519_alt_loop + +// We know the intermediate result x < 2^256 - 38, and now we do strict +// modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255 +// which is equivalent to a "pl" condition. 
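For reference, the step described above (x >= p_25519 exactly when x + 19 has bit 255 set, in which case clearing that bit of x + 19 gives x - p_25519) can be modelled in C as follows, assuming little-endian limbs and an illustrative helper name:

#include <stdint.h>

// z = x mod p_25519 for x < 2^256 - 38 = 2 * p_25519.
static void strict_reduce_model(uint64_t z[4], const uint64_t x[4]) {
  uint64_t t[4];
  unsigned __int128 acc = (unsigned __int128)x[0] + 19;
  t[0] = (uint64_t)acc; acc >>= 64;
  for (int i = 1; i < 4; i++) {
    acc += x[i];
    t[i] = (uint64_t)acc;
    acc >>= 64;
  }
  // Bit 255 of x + 19 is the "mi"/"pl" distinction in the assembly.
  uint64_t mask = (uint64_t)0 - (t[3] >> 63);   // all-ones iff x >= p_25519
  for (int i = 0; i < 4; i++) {
    z[i] = (t[i] & mask) | (x[i] & ~mask);
  }
  z[3] &= 0x7fffffffffffffffULL;                // the closing "bic"
}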
+ + adds x6, x2, #19 + adcs x7, x3, xzr + adcs x8, x4, xzr + adcs x9, x5, xzr + + csel x2, x2, x6, pl + csel x3, x3, x7, pl + csel x4, x4, x8, pl + csel x5, x5, x9, pl + bic x5, x5, #0x8000000000000000 + +// Copy result back into destination and return + + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/curve25519/bignum_madd_n25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_madd_n25519.S similarity index 73% rename from third_party/s2n-bignum/arm/curve25519/bignum_madd_n25519.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_madd_n25519.S index 0171271872d..e6fbc4e3fd5 100644 --- a/third_party/s2n-bignum/arm/curve25519/bignum_madd_n25519.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_madd_n25519.S @@ -39,9 +39,9 @@ // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 // Single round of modular reduction mod_n25519, mapping @@ -50,27 +50,27 @@ // close to the loop body of the bignum_mod_n25519 function. #define reduce(m4,m3,m2,m1,m0) \ - extr q, m4, m3, #60; \ - and m3, m3, #0x0FFFFFFFFFFFFFFF; \ - sub q, q, m4, lsr #60; \ - and t0, m4, #0xF000000000000000; \ - add m3, m3, t0; \ - mul t0, n0, q; \ - mul t1, n1, q; \ - umulh t2, n0, q; \ - adds t1, t1, t2; \ - umulh t2, n1, q; \ - adc t2, t2, xzr; \ - subs m0, m0, t0; \ - sbcs m1, m1, t1; \ - sbcs m2, m2, t2; \ - sbcs m3, m3, xzr; \ - csel t0, n0, xzr, cc; \ - csel t1, n1, xzr, cc; \ - adds m0, m0, t0; \ - and t2, t0, #0x1000000000000000; \ - adcs m1, m1, t1; \ - adcs m2, m2, xzr; \ + extr q, m4, m3, #60 __LF \ + and m3, m3, #0x0FFFFFFFFFFFFFFF __LF \ + sub q, q, m4, lsr #60 __LF \ + and t0, m4, #0xF000000000000000 __LF \ + add m3, m3, t0 __LF \ + mul t0, n0, q __LF \ + mul t1, n1, q __LF \ + umulh t2, n0, q __LF \ + adds t1, t1, t2 __LF \ + umulh t2, n1, q __LF \ + adc t2, t2, xzr __LF \ + subs m0, m0, t0 __LF \ + sbcs m1, m1, t1 __LF \ + sbcs m2, m2, t2 __LF \ + sbcs m3, m3, xzr __LF \ + csel t0, n0, xzr, cc __LF \ + csel t1, n1, xzr, cc __LF \ + adds m0, m0, t0 __LF \ + and t2, t0, #0x1000000000000000 __LF \ + adcs m1, m1, t1 __LF \ + adcs m2, m2, xzr __LF \ adc m3, m3, t2 // Special case of "reduce" with m4 = 0. As well as not using m4, @@ -78,24 +78,24 @@ // versus min (floor(m/2^252)) (2^63-1). 
#define reduce0(m3,m2,m1,m0) \ - lsr q, m3, #60; \ - and m3, m3, #0x0FFFFFFFFFFFFFFF; \ - mul t0, n0, q; \ - mul t1, n1, q; \ - umulh t2, n0, q; \ - adds t1, t1, t2; \ - umulh t2, n1, q; \ - adc t2, t2, xzr; \ - subs m0, m0, t0; \ - sbcs m1, m1, t1; \ - sbcs m2, m2, t2; \ - sbcs m3, m3, xzr; \ - csel t0, n0, xzr, cc; \ - csel t1, n1, xzr, cc; \ - adds m0, m0, t0; \ - and t2, t0, #0x1000000000000000; \ - adcs m1, m1, t1; \ - adcs m2, m2, xzr; \ + lsr q, m3, #60 __LF \ + and m3, m3, #0x0FFFFFFFFFFFFFFF __LF \ + mul t0, n0, q __LF \ + mul t1, n1, q __LF \ + umulh t2, n0, q __LF \ + adds t1, t1, t2 __LF \ + umulh t2, n1, q __LF \ + adc t2, t2, xzr __LF \ + subs m0, m0, t0 __LF \ + sbcs m1, m1, t1 __LF \ + sbcs m2, m2, t2 __LF \ + sbcs m3, m3, xzr __LF \ + csel t0, n0, xzr, cc __LF \ + csel t1, n1, xzr, cc __LF \ + adds m0, m0, t0 __LF \ + and t2, t0, #0x1000000000000000 __LF \ + adcs m1, m1, t1 __LF \ + adcs m2, m2, xzr __LF \ adc m3, m3, t2 S2N_BN_SYMBOL(bignum_madd_n25519): diff --git a/third_party/s2n-bignum/arm/curve25519/bignum_madd_n25519_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_madd_n25519_alt.S similarity index 66% rename from third_party/s2n-bignum/arm/curve25519/bignum_madd_n25519_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_madd_n25519_alt.S index d1cdfb2c3b8..45d984f4514 100644 --- a/third_party/s2n-bignum/arm/curve25519/bignum_madd_n25519_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_madd_n25519_alt.S @@ -39,9 +39,9 @@ // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 // Single round of modular reduction mod_n25519, mapping @@ -50,27 +50,27 @@ // close to the loop body of the bignum_mod_n25519 function. #define reduce(m4,m3,m2,m1,m0) \ - extr q, m4, m3, #60; \ - and m3, m3, #0x0FFFFFFFFFFFFFFF; \ - sub q, q, m4, lsr #60; \ - and t0, m4, #0xF000000000000000; \ - add m3, m3, t0; \ - mul t0, n0, q; \ - mul t1, n1, q; \ - umulh t2, n0, q; \ - adds t1, t1, t2; \ - umulh t2, n1, q; \ - adc t2, t2, xzr; \ - subs m0, m0, t0; \ - sbcs m1, m1, t1; \ - sbcs m2, m2, t2; \ - sbcs m3, m3, xzr; \ - csel t0, n0, xzr, cc; \ - csel t1, n1, xzr, cc; \ - adds m0, m0, t0; \ - and t2, t0, #0x1000000000000000; \ - adcs m1, m1, t1; \ - adcs m2, m2, xzr; \ + extr q, m4, m3, #60 __LF \ + and m3, m3, #0x0FFFFFFFFFFFFFFF __LF \ + sub q, q, m4, lsr #60 __LF \ + and t0, m4, #0xF000000000000000 __LF \ + add m3, m3, t0 __LF \ + mul t0, n0, q __LF \ + mul t1, n1, q __LF \ + umulh t2, n0, q __LF \ + adds t1, t1, t2 __LF \ + umulh t2, n1, q __LF \ + adc t2, t2, xzr __LF \ + subs m0, m0, t0 __LF \ + sbcs m1, m1, t1 __LF \ + sbcs m2, m2, t2 __LF \ + sbcs m3, m3, xzr __LF \ + csel t0, n0, xzr, cc __LF \ + csel t1, n1, xzr, cc __LF \ + adds m0, m0, t0 __LF \ + and t2, t0, #0x1000000000000000 __LF \ + adcs m1, m1, t1 __LF \ + adcs m2, m2, xzr __LF \ adc m3, m3, t2 // Special case of "reduce" with m4 = 0. As well as not using m4, @@ -78,24 +78,24 @@ // versus min (floor(m/2^252)) (2^63-1). 
#define reduce0(m3,m2,m1,m0) \ - lsr q, m3, #60; \ - and m3, m3, #0x0FFFFFFFFFFFFFFF; \ - mul t0, n0, q; \ - mul t1, n1, q; \ - umulh t2, n0, q; \ - adds t1, t1, t2; \ - umulh t2, n1, q; \ - adc t2, t2, xzr; \ - subs m0, m0, t0; \ - sbcs m1, m1, t1; \ - sbcs m2, m2, t2; \ - sbcs m3, m3, xzr; \ - csel t0, n0, xzr, cc; \ - csel t1, n1, xzr, cc; \ - adds m0, m0, t0; \ - and t2, t0, #0x1000000000000000; \ - adcs m1, m1, t1; \ - adcs m2, m2, xzr; \ + lsr q, m3, #60 __LF \ + and m3, m3, #0x0FFFFFFFFFFFFFFF __LF \ + mul t0, n0, q __LF \ + mul t1, n1, q __LF \ + umulh t2, n0, q __LF \ + adds t1, t1, t2 __LF \ + umulh t2, n1, q __LF \ + adc t2, t2, xzr __LF \ + subs m0, m0, t0 __LF \ + sbcs m1, m1, t1 __LF \ + sbcs m2, m2, t2 __LF \ + sbcs m3, m3, xzr __LF \ + csel t0, n0, xzr, cc __LF \ + csel t1, n1, xzr, cc __LF \ + adds m0, m0, t0 __LF \ + and t2, t0, #0x1000000000000000 __LF \ + adcs m1, m1, t1 __LF \ + adcs m2, m2, xzr __LF \ adc m3, m3, t2 S2N_BN_SYMBOL(bignum_madd_n25519_alt): diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mod_m25519_4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mod_m25519_4.S new file mode 100644 index 00000000000..75f5e7ece4e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mod_m25519_4.S @@ -0,0 +1,81 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod m_25519 +// Input x[4]; output z[4] +// +// extern void bignum_mod_m25519_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Reduction is modulo the group order of curve25519/edwards25519. +// This is the full group order, 8 * the standard basepoint order. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_m25519_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_m25519_4) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define n0 x2 +#define n1 x3 +#define n2 x4 +#define n3 x5 + +#define d0 x6 +#define d1 x7 +#define d2 x8 +#define d3 x9 + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(bignum_mod_m25519_4): + +// Load the complicated three words of m_25519 (the other being n2 = 0) + + movbig( n0, #0xc093, #0x18d2, #0xe7ae, #0x9f68) + movbig( n1, #0xa6f7, #0xcef5, #0x17bc, #0xe6b2) + mov n3, #0x8000000000000000 + +// Load the input number + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Do the subtraction. 
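Since m_25519 is larger than 2^255, any 4-word input is below 2 * m_25519, so the single conditional subtraction that follows is enough. A short Python model (the decimal basepoint-order value is the standard constant, not taken from this file):

    M = (0x8000000000000000 << 192) | (0xa6f7cef517bce6b2 << 64) | 0xc09318d2e7ae9f68
    L = (1 << 252) + 27742317777372353535851937790883648493   # standard basepoint order
    assert M == 8 * L                                          # full group order, as stated above

    def mod_m25519_4(x):
        assert 0 <= x < (1 << 256)
        d = x - M
        return x if d < 0 else d       # csel keeps the original digits on borrow

    for v in (0, M - 1, M, M + 12345, (1 << 256) - 1):
        assert mod_m25519_4(v) == v % M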
+ + subs n0, d0, n0 + sbcs n1, d1, n1 + sbcs n2, d2, xzr + sbcs n3, d3, n3 + +// Now if the carry is *clear* (inversion at work) the subtraction carried +// and hence we should have done nothing, so we reset each n_i = d_i + + csel n0, d0, n0, cc + csel n1, d1, n1, cc + csel n2, d2, n2, cc + csel n3, d3, n3, cc + +// Store the end result + + stp n0, n1, [z] + stp n2, n3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/arm/curve25519/bignum_mod_n25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mod_n25519.S similarity index 89% rename from third_party/s2n-bignum/arm/curve25519/bignum_mod_n25519.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mod_n25519.S index 2fe18ac7545..591baa50359 100644 --- a/third_party/s2n-bignum/arm/curve25519/bignum_mod_n25519.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mod_n25519.S @@ -45,9 +45,9 @@ // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 S2N_BN_SYMBOL(bignum_mod_n25519): @@ -55,7 +55,7 @@ S2N_BN_SYMBOL(bignum_mod_n25519): // If the input is already <= 3 words long, go to a trivial "copy" path cmp k, #4 - bcc short + bcc bignum_mod_n25519_short // Otherwise load the top 4 digits (top-down) and reduce k by 4 // This [m3;m2;m1;m0] is the initial x where we begin reduction. @@ -108,7 +108,7 @@ S2N_BN_SYMBOL(bignum_mod_n25519): // is similar to the sequence above except for the more refined quotient // estimation process. - cbz k, writeback + cbz k, bignum_mod_n25519_writeback bignum_mod_n25519_loop: @@ -158,28 +158,28 @@ bignum_mod_n25519_loop: // Finally write back [m3;m2;m1;m0] and return -writeback: +bignum_mod_n25519_writeback: stp m0, m1, [z] stp m2, m3, [z, #16] ret // Short case: just copy the input with zero-padding -short: +bignum_mod_n25519_short: mov m0, xzr mov m1, xzr mov m2, xzr mov m3, xzr - cbz k, writeback + cbz k, bignum_mod_n25519_writeback ldr m0, [x] subs k, k, #1 - beq writeback + beq bignum_mod_n25519_writeback ldr m1, [x, #8] subs k, k, #1 - beq writeback + beq bignum_mod_n25519_writeback ldr m2, [x, #16] - b writeback + b bignum_mod_n25519_writeback #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mod_n25519_4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mod_n25519_4.S new file mode 100644 index 00000000000..ab86e7df944 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mod_n25519_4.S @@ -0,0 +1,104 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo basepoint order, z := x mod n_25519 +// Input x[4]; output z[4] +// +// extern void bignum_mod_n25519_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Reduction is modulo the order of the curve25519/edwards25519 basepoint. 
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n25519_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n25519_4) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define n0 x2 +#define n1 x3 + +#define d0 x4 +#define d1 x5 +#define d2 x6 +#define d3 x7 + +#define q x8 + +#define m0 x9 +#define m1 x10 +#define m2 x11 + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(bignum_mod_n25519_4): + +// Load the complicated three words of n_25519. +// The others are n2 = 0 and n3 = 0x1000000000000000, which +// are handled a bit differently + + movbig( n0, #0x5812, #0x631a, #0x5cf5, #0xd3ed) + movbig( n1, #0x14de, #0xf9de, #0xa2f7, #0x9cd6) + +// Load the input number + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Get the quotient estimate q = floor(x/2^252). +// Also delete it from d3, in effect doing x' = x - q * 2^252 + + lsr q, d3, #60 + and d3, d3, #0x0FFFFFFFFFFFFFFF + +// Multiply [m2;m1;m0] = q * [n1;n0] + + mul m0, n0, q + mul m1, n1, q + umulh m2, n0, q + adds m1, m1, m2 + umulh m2, n1, q + adc m2, m2, xzr + +// Subtract [d3;d2;d1;d0] = x' - q * [n1;n0] = x - q * n_25519 + + subs d0, d0, m0 + sbcs d1, d1, m1 + sbcs d2, d2, m2 + sbcs d3, d3, xzr + +// If this borrows (CF = 0 because of inversion), add back n_25519. +// The masked n3 digit exploits the fact that bit 60 of n0 is set. + + csel n0, n0, xzr, cc + csel n1, n1, xzr, cc + + adds d0, d0, n0 + adcs d1, d1, n1 + and n0, n0, #0x1000000000000000 + adcs d2, d2, xzr + adc d3, d3, n0 + +// Store the end result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mod_p25519_4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mod_p25519_4.S new file mode 100644 index 00000000000..4502ef480d1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mod_p25519_4.S @@ -0,0 +1,72 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo field characteristic, z := x mod p_25519 +// Input x[4]; output z[4] +// +// extern void bignum_mod_p25519_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_p25519_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_p25519_4) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 +#define q x6 +#define c x7 + +S2N_BN_SYMBOL(bignum_mod_p25519_4): + +// Load the inputs as [d3;d2;d1;d0] + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Letting x = 2^255 * h + l where h is the top bit, the provisional quotient +// is q = h + 1, which is either correct or 1 too high. 
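The q = h + 1 estimate described above is exact or one too large, so a single conditional correction finishes the reduction; a minimal Python sketch of that reasoning:

    P = (1 << 255) - 19

    def mod_p25519_4(x):
        assert 0 <= x < (1 << 256)
        q = (x >> 255) + 1            # h + 1, with h the top bit of x
        r = x - q * P                 # lies in [-P, 38), so at most one fix-up
        return r + P if r < 0 else r  # add back p_25519 when the estimate was high

    for v in (0, 18, 19, P - 1, P, P + 19, (1 << 256) - 1):
        assert mod_p25519_4(v) == v % P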
+ + mov c, #19 + lsr q, d3, #63 + madd q, c, q, c + +// Writing the provisional remainder as r = x - (2^255 - 19) * q we +// compute r' = (2^255 + l) + 19 * q = r + 2^256 + + adds d0, d0, q + adcs d1, d1, xzr + adcs d2, d2, xzr + orr d3, d3, #0x8000000000000000 + adcs d3, d3, xzr + +// Now r < 0 <=> r' < 2^256 <=> ~CF and in this case we correct by adding +// 2^255 - 19, or in fact subtracting 19 and masking to 255 bits. + + csel q, c, xzr, cc + subs d0, d0, q + sbcs d1, d1, xzr + sbcs d2, d2, xzr + sbc d3, d3, xzr + and d3, d3, #~0x8000000000000000 + +// Store the end result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mul_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mul_p25519.S new file mode 100644 index 00000000000..2855e2ddb90 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mul_p25519.S @@ -0,0 +1,334 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply modulo p_25519, z := (x * y) mod p_25519 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_mul_p25519 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p25519) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 + +#define a0 x3 +#define a0short w3 +#define a1 x4 +#define b0 x5 +#define b0short w5 +#define b1 x6 + +#define u0 x7 +#define u1 x8 +#define u2 x9 +#define u3 x10 +#define u4 x11 +#define u5 x12 +#define u6 x13 +#define u7 x14 + +#define u0short w7 +#define u1short w8 +#define u2short w9 +#define u3short w10 +#define u4short w11 +#define u5short w12 +#define u6short w13 +#define u7short w14 + +#define t x15 + +#define sgn x16 +#define ysgn x17 + +// These are aliases to registers used elsewhere including input pointers. +// By the time they are used this does not conflict with other uses. 
+ +#define m0 y +#define m1 ysgn +#define m2 t +#define m3 x +#define u u2 + +// For the reduction stages, again aliasing other things but not the u's + +#define c x3 +#define cshort w3 +#define h x4 +#define l x5 +#define lshort w5 +#define d x6 +#define q x17 +#define qshort w17 + +S2N_BN_SYMBOL(bignum_mul_p25519): + +// Multiply the low halves using Karatsuba 2x2->4 to get [u3,u2,u1,u0] + + ldp a0, a1, [x] + ldp b0, b1, [y] + + umull u0, a0short, b0short + lsr x17, a0, #32 + umull x15, w17, b0short + lsr x16, b0, #32 + umull u1, w16, w17 + umull x16, a0short, w16 + adds u0, u0, x15, lsl #32 + lsr x15, x15, #32 + adc u1, u1, x15 + adds u0, u0, x16, lsl #32 + lsr x16, x16, #32 + adc u1, u1, x16 + + mul u2, a1, b1 + umulh u3, a1, b1 + + subs a1, a1, a0 + cneg a1, a1, cc + csetm sgn, cc + + adds u2, u2, u1 + adc u3, u3, xzr + + subs a0, b0, b1 + cneg a0, a0, cc + cinv sgn, sgn, cc + + mul t, a1, a0 + umulh a0, a1, a0 + + adds u1, u0, u2 + adcs u2, u2, u3 + adc u3, u3, xzr + + adds xzr, sgn, #1 + eor t, t, sgn + adcs u1, t, u1 + eor a0, a0, sgn + adcs u2, a0, u2 + adc u3, u3, sgn + +// Multiply the high halves using Karatsuba 2x2->4 to get [u7,u6,u5,u4] + + ldp a0, a1, [x, #16] + ldp b0, b1, [y, #16] + + umull u4, a0short, b0short + lsr x17, a0, #32 + umull x15, w17, b0short + lsr x16, b0, #32 + umull u5, w16, w17 + umull x16, a0short, w16 + adds u4, u4, x15, lsl #32 + lsr x15, x15, #32 + adc u5, u5, x15 + adds u4, u4, x16, lsl #32 + lsr x16, x16, #32 + adc u5, u5, x16 + + mul u6, a1, b1 + umulh u7, a1, b1 + + subs a1, a1, a0 + cneg a1, a1, cc + csetm sgn, cc + + adds u6, u6, u5 + adc u7, u7, xzr + + subs a0, b0, b1 + cneg a0, a0, cc + cinv sgn, sgn, cc + + mul t, a1, a0 + umulh a0, a1, a0 + + adds u5, u4, u6 + adcs u6, u6, u7 + adc u7, u7, xzr + + adds xzr, sgn, #1 + eor t, t, sgn + adcs u5, t, u5 + eor a0, a0, sgn + adcs u6, a0, u6 + adc u7, u7, sgn + +// Compute sgn,[a1,a0] = x_hi - x_lo +// and ysgn,[b1,b0] = y_lo - y_hi +// sign-magnitude differences + + ldp a0, a1, [x, #16] + ldp t, sgn, [x] + subs a0, a0, t + sbcs a1, a1, sgn + csetm sgn, cc + + ldp t, ysgn, [y] + subs b0, t, b0 + sbcs b1, ysgn, b1 + csetm ysgn, cc + + eor a0, a0, sgn + subs a0, a0, sgn + eor a1, a1, sgn + sbc a1, a1, sgn + + eor b0, b0, ysgn + subs b0, b0, ysgn + eor b1, b1, ysgn + sbc b1, b1, ysgn + +// Save the correct sign for the sub-product + + eor sgn, ysgn, sgn + +// Add H' = H + L_top, still in [u7,u6,u5,u4] + + adds u4, u4, u2 + adcs u5, u5, u3 + adcs u6, u6, xzr + adc u7, u7, xzr + +// Now compute the mid-product as [m3,m2,m1,m0] + + mul m0, a0, b0 + umulh m1, a0, b0 + mul m2, a1, b1 + umulh m3, a1, b1 + + subs a1, a1, a0 + cneg a1, a1, cc + csetm u, cc + + adds m2, m2, m1 + adc m3, m3, xzr + + subs b1, b0, b1 + cneg b1, b1, cc + cinv u, u, cc + + mul b0, a1, b1 + umulh b1, a1, b1 + + adds m1, m0, m2 + adcs m2, m2, m3 + adc m3, m3, xzr + + adds xzr, u, #1 + eor b0, b0, u + adcs m1, b0, m1 + eor b1, b1, u + adcs m2, b1, m2 + adc m3, m3, u + +// Accumulate the positive mid-terms as [u7,u6,u5,u4,u3,u2] + + adds u2, u4, u0 + adcs u3, u5, u1 + adcs u4, u6, u4 + adcs u5, u7, u5 + adcs u6, u6, xzr + adc u7, u7, xzr + +// Add in the sign-adjusted complex term + + adds xzr, sgn, #1 + eor m0, m0, sgn + adcs u2, m0, u2 + eor m1, m1, sgn + adcs u3, m1, u3 + eor m2, m2, sgn + adcs u4, m2, u4 + eor m3, m3, sgn + adcs u5, m3, u5 + adcs u6, u6, sgn + adc u7, u7, sgn + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [u7,u6,u5,u4] and l = [u3,u2,u1,u0] +// and this is == 38 * h + l (mod p_25519). 
+// We do the 38 * h + l using 32-bit multiplies avoiding umulh, +// and pre-estimate and feed in the next-level quotient +// q = h + 1 where h = an early version of the high 255 bits. +// We add 2^255 * h - 19 * (h + 1), so end up offset by 2^255. + + mov c, #38 + + umull h, u4short, cshort + add h, h, u0short, uxtw + lsr u0, u0, #32 + lsr u4, u4, #32 + umaddl u4, u4short, cshort, u0 + mov u0, h + + umull h, u5short, cshort + add h, h, u1short, uxtw + lsr u1, u1, #32 + lsr u5, u5, #32 + umaddl u5, u5short, cshort, u1 + mov u1, h + + umull h, u6short, cshort + add h, h, u2short, uxtw + lsr u2, u2, #32 + lsr u6, u6, #32 + umaddl u6, u6short, cshort, u2 + mov u2, h + + umull h, u7short, cshort + add h, h, u3short, uxtw + lsr u3, u3, #32 + lsr u7, u7, #32 + umaddl u7, u7short, cshort, u3 + mov u3, h + + lsr q, u7, #31 + + mov l, #19 + umaddl l, lshort, qshort, l + add u0, u0, l + + adds u0, u0, u4, lsl #32 + extr c, u5, u4, #32 + adcs u1, u1, c + extr c, u6, u5, #32 + adcs u2, u2, c + extr c, u7, u6, #32 + lsl l, q, #63 + eor u3, u3, l + adc u3, u3, c + +// Now we correct by a final 2^255-19 if the top bit is clear +// meaning that the "real" pre-reduced result is negative. + + mov c, #19 + tst u3, #0x8000000000000000 + csel c, c, xzr, pl + subs u0, u0, c + sbcs u1, u1, xzr + sbcs u2, u2, xzr + sbc u3, u3, xzr + and u3, u3, #~0x8000000000000000 + +// Write back result + + stp u0, u1, [x0] + stp u2, u3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mul_p25519_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mul_p25519_alt.S new file mode 100644 index 00000000000..393069b6c79 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mul_p25519_alt.S @@ -0,0 +1,203 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply modulo p_25519, z := (x * y) mod p_25519 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_mul_p25519_alt +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p25519_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p25519_alt) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 + +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define a3 x6 +#define b0 x7 +#define b1 x8 +#define b2 x9 +#define b3 x10 + +#define l x11 + +#define u0 x12 +#define u1 x13 +#define u2 x14 +#define u3 x15 +#define u4 x16 + +// These alias to the input arguments when no longer needed + +#define u5 a0 +#define u6 a1 +#define u7 a2 + +#define c b0 +#define q b1 +#define h b2 + +S2N_BN_SYMBOL(bignum_mul_p25519_alt): + +// Load operands and set up row 0 = [u4;...;u0] = a0 * [b3;...;b0] + + ldp a0, a1, [x] + ldp b0, b1, [y] + + mul u0, a0, b0 + umulh u1, a0, b0 + mul l, a0, b1 + umulh u2, a0, b1 + adds u1, u1, l + + ldp b2, b3, [y, #16] + + mul l, a0, b2 + umulh u3, a0, b2 + adcs u2, u2, l + + mul l, a0, b3 + umulh u4, a0, b3 + adcs u3, u3, l + adc u4, u4, xzr + + ldp a2, a3, [x, #16] + +// Row 1 = [u5;...;u0] = [a1;a0] * [b3;...;b0] + + mul l, a1, b0 + adds u1, u1, l + mul l, a1, b1 + adcs u2, u2, l + mul l, a1, b2 + adcs u3, u3, l + mul l, a1, b3 + adcs u4, u4, l + umulh u5, a1, b3 + adc u5, u5, xzr + + umulh l, a1, b0 + adds u2, u2, l + umulh l, a1, b1 + adcs u3, u3, l + umulh l, a1, b2 + adcs u4, u4, l + adc u5, u5, xzr + +// Row 2 = [u6;...;u0] = [a2;a1;a0] * [b3;...;b0] + + mul l, a2, b0 + adds u2, u2, l + mul l, a2, b1 + adcs u3, u3, l + mul l, a2, b2 + adcs u4, u4, l + mul l, a2, b3 + adcs u5, u5, l + umulh u6, a2, b3 + adc u6, u6, xzr + + umulh l, a2, b0 + adds u3, u3, l + umulh l, a2, b1 + adcs u4, u4, l + umulh l, a2, b2 + adcs u5, u5, l + adc u6, u6, xzr + +// Row 3 = [u7;...;u0] = [a3;...a0] * [b3;...;b0] + + mul l, a3, b0 + adds u3, u3, l + mul l, a3, b1 + adcs u4, u4, l + mul l, a3, b2 + adcs u5, u5, l + mul l, a3, b3 + adcs u6, u6, l + umulh u7, a3, b3 + adc u7, u7, xzr + + umulh l, a3, b0 + adds u4, u4, l + umulh l, a3, b1 + adcs u5, u5, l + umulh l, a3, b2 + adcs u6, u6, l + adc u7, u7, xzr + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [u7,u6,u5,u4] and l = [u3,u2,u1,u0] +// and this is == 38 * h + l (mod p_25519) + + mov c, #38 + + mul l, c, u4 + umulh h, c, u4 + adds u0, u0, l + + mul l, c, u5 + umulh u5, c, u5 + adcs u1, u1, l + + mul l, c, u6 + umulh u6, c, u6 + adcs u2, u2, l + + mul l, c, u7 + umulh u7, c, u7 + adcs u3, u3, l + cset u4, cs + +// Compute the top part deferring the [u5,h] addition till the following +// carry chain. This is enough to get a good quotient estimate and saves +// a couple of instructions. + + adds u3, u3, u6 + adc u4, u4, u7 + +// Now we have reduced to 5 digits, 2^255 * H + L = [u4,u3,u2,u1,u0] +// Use q = H + 1 as the initial quotient estimate, either right or 1 too big. 
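The two reduction ideas used above (folding the top 256 bits via 2^256 == 38 mod p_25519, then the q = H + 1 estimate on the remaining five digits) can be checked end to end in Python; this is a sketch of the arithmetic, not of the register-level carry handling:

    P = (1 << 255) - 19
    assert (1 << 256) % P == 38

    def reduce_product(prod):
        assert 0 <= prod < (1 << 512)            # full 8-digit product
        h, l = prod >> 256, prod & ((1 << 256) - 1)
        t = 38 * h + l                           # below 2^262, fits in five 64-bit digits
        r = t - ((t >> 255) + 1) * P             # q = H + 1: exact or one too big
        return r + P if r < 0 else r

    for prod in (0, P - 1, P, 38 * P + 5, (P - 1) ** 2, ((1 << 256) - 1) ** 2):
        assert reduce_product(prod) == prod % P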
+ + adds xzr, u3, u3 + orr u3, u3, #0x8000000000000000 + adc q, u4, u4 + mov c, #19 + madd l, c, q, c + adds u0, u0, l + adcs u1, u1, h + adcs u2, u2, u5 + adcs u3, u3, xzr + +// Now the effective answer is 2^256 * (CF - 1) + [u3,u2,u1,u0] +// So we correct if CF = 0 by subtracting 19, either way masking to +// 255 bits, i.e. by effectively adding p_25519 to the "full" answer + + csel c, c, xzr, cc + subs u0, u0, c + sbcs u1, u1, xzr + sbcs u2, u2, xzr + sbc u3, u3, xzr + bic u3, u3, #0x8000000000000000 + +// Write back and return + + stp u0, u1, [x0] + stp u2, u3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/arm/curve25519/bignum_neg_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_neg_p25519.S similarity index 100% rename from third_party/s2n-bignum/arm/curve25519/bignum_neg_p25519.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_neg_p25519.S diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_optneg_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_optneg_p25519.S new file mode 100644 index 00000000000..026dbe47813 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_optneg_p25519.S @@ -0,0 +1,75 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally negate modulo p_25519, z := (-x) mod p_25519 (if p nonzero) or +// z := x (if p zero), assuming x reduced +// Inputs p, x[4]; output z[4] +// +// extern void bignum_optneg_p25519 +// (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = p, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optneg_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optneg_p25519) + .text + .balign 4 + +#define z x0 +#define p x1 +#define x x2 + +#define d0 x3 +#define d1 x4 +#define d2 x5 +#define d3 x6 +#define n0 x7 +#define n1 x8 +#define n2 x9 +#define n3 x10 + +#define c x11 + +S2N_BN_SYMBOL(bignum_optneg_p25519): + +// Load the digits of x as [d3;d2;d1;d0] and compute c = the OR of those digits +// as well as its proto-negation [n3;n2;n1;n0] = (2^255 - 19) - x + + ldp d0, d1, [x] + mov n0, #-19 + orr c, d0, d1 + subs n0, n0, d0 + mov n2, #-1 + sbcs n1, n2, d1 + ldp d2, d3, [x, #16] + orr c, c, d2 + sbcs n2, n2, d2 + mov n3, #0x7FFFFFFFFFFFFFFF + orr c, c, d3 + sbc n3, n3, d3 + +// Now we return just x if p = 0 or if x = 0 (to avoid giving -0 = p, which +// is not strictly reduced even though it's correct modulo p). The conditional +// comparison uses immediate 4 which means ZF. 
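The selection implemented by the cmp/ccmp pair above boils down to "negate only when both p and x are nonzero", which keeps the result strictly reduced (0 rather than p_25519 for -0). A small Python model:

    P = (1 << 255) - 19

    def optneg_p25519(p_flag, x):
        assert 0 <= x < P                     # input assumed reduced, as in the header
        return (P - x) if (p_flag != 0 and x != 0) else x

    assert optneg_p25519(1, 0) == 0           # -0 stays 0, not P
    assert optneg_p25519(0, 5) == 5
    assert (optneg_p25519(1, 5) + 5) % P == 0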
+ + cmp p, xzr + ccmp c, xzr, #4, ne + + csel d0, n0, d0, ne + csel d1, n1, d1, ne + csel d2, n2, d2, ne + csel d3, n3, d3, ne + +// Write back result and return + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sqr_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sqr_p25519.S new file mode 100644 index 00000000000..1bcb3aea6ab --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sqr_p25519.S @@ -0,0 +1,230 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square modulo p_25519, z := (x^2) mod p_25519 +// Input x[4]; output z[4] +// +// extern void bignum_sqr_p25519 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p25519) + .text + .balign 4 + +#define z x0 +#define x x1 + +// Variables + +#define u0 x2 +#define u1 x3 +#define u2 x4 +#define u3 x5 +#define u4 x6 +#define u5 x7 +#define u6 x8 +#define u7 x9 + +#define u0short w2 +#define u1short w3 +#define u2short w4 +#define u3short w5 +#define u4short w6 +#define u5short w7 +#define u6short w8 +#define u7short w9 + +#define c x10 +#define cshort w10 +#define l x11 +#define lshort w11 +#define h x12 +#define hshort w12 +#define q x13 +#define qshort w13 + +#define t1 x14 +#define t1short w14 +#define t2 x15 +#define t2short w15 +#define t3 x16 +#define t3short w16 + +S2N_BN_SYMBOL(bignum_sqr_p25519): + +// First just a near-clone of bignum_sqr_4_8 to get the square, using +// different registers to collect full product without writeback. 
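Assuming the same subtractive-Karatsuba split as bignum_sqr_4_8 (an assumption based on the comment above, not a trace of the register schedule), the squaring rests on the identity below, checked numerically in Python with B = 2^128:

    B = 1 << 128
    for x in (0, 3, (1 << 255) - 19, (1 << 256) - 1):
        H, L = x >> 128, x % B
        assert x * x == (B * B + B) * H * H + (B + 1) * L * L - B * (H - L) ** 2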
+ + ldp c, l, [x] + ldp h, q, [x, #16] + umull u0, cshort, cshort + lsr t1, c, #32 + umull u1, t1short, t1short + umull t1, cshort, t1short + adds u0, u0, t1, lsl #33 + lsr t1, t1, #31 + adc u1, u1, t1 + umull u2, lshort, lshort + lsr t1, l, #32 + umull u3, t1short, t1short + umull t1, lshort, t1short + mul t2, c, l + umulh t3, c, l + adds u2, u2, t1, lsl #33 + lsr t1, t1, #31 + adc u3, u3, t1 + adds t2, t2, t2 + adcs t3, t3, t3 + adc u3, u3, xzr + adds u1, u1, t2 + adcs u2, u2, t3 + adc u3, u3, xzr + umull u4, hshort, hshort + lsr t1, h, #32 + umull u5, t1short, t1short + umull t1, hshort, t1short + adds u4, u4, t1, lsl #33 + lsr t1, t1, #31 + adc u5, u5, t1 + umull u6, qshort, qshort + lsr t1, q, #32 + umull u7, t1short, t1short + umull t1, qshort, t1short + mul t2, h, q + umulh t3, h, q + adds u6, u6, t1, lsl #33 + lsr t1, t1, #31 + adc u7, u7, t1 + adds t2, t2, t2 + adcs t3, t3, t3 + adc u7, u7, xzr + adds u5, u5, t2 + adcs u6, u6, t3 + adc u7, u7, xzr + subs c, c, h + sbcs l, l, q + csetm t3, cc + eor c, c, t3 + subs c, c, t3 + eor l, l, t3 + sbc l, l, t3 + adds u4, u4, u2 + adcs u5, u5, u3 + adcs u6, u6, xzr + adc u7, u7, xzr + umull h, cshort, cshort + lsr u3, c, #32 + umull q, u3short, u3short + umull u3, cshort, u3short + adds h, h, u3, lsl #33 + lsr u3, u3, #31 + adc q, q, u3 + umull t2, lshort, lshort + lsr u3, l, #32 + umull t1, u3short, u3short + umull u3, lshort, u3short + mul u2, c, l + umulh t3, c, l + adds t2, t2, u3, lsl #33 + lsr u3, u3, #31 + adc t1, t1, u3 + adds u2, u2, u2 + adcs t3, t3, t3 + adc t1, t1, xzr + adds q, q, u2 + adcs t2, t2, t3 + adc t1, t1, xzr + adds u2, u0, u4 + adcs u3, u1, u5 + adcs u4, u4, u6 + adcs u5, u5, u7 + csetm t3, cc + subs u2, u2, h + sbcs u3, u3, q + sbcs u4, u4, t2 + sbcs u5, u5, t1 + adcs u6, u6, t3 + adc u7, u7, t3 + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [u7,u6,u5,u4] and l = [u3,u2,u1,u0] +// and this is == 38 * h + l (mod p_25519). +// We do the 38 * h + l using 32-bit multiplies avoiding umulh, +// and pre-estimate and feed in the next-level quotient +// q = h + 1 where h = an early version of the high 255 bits. +// We add 2^255 * h - 19 * (h + 1), so end up offset by 2^255. + + mov c, #38 + + umull h, u4short, cshort + add h, h, u0short, uxtw + lsr u0, u0, #32 + lsr u4, u4, #32 + umaddl u4, u4short, cshort, u0 + mov u0, h + + umull h, u5short, cshort + add h, h, u1short, uxtw + lsr u1, u1, #32 + lsr u5, u5, #32 + umaddl u5, u5short, cshort, u1 + mov u1, h + + umull h, u6short, cshort + add h, h, u2short, uxtw + lsr u2, u2, #32 + lsr u6, u6, #32 + umaddl u6, u6short, cshort, u2 + mov u2, h + + umull h, u7short, cshort + add h, h, u3short, uxtw + lsr u3, u3, #32 + lsr u7, u7, #32 + umaddl u7, u7short, cshort, u3 + mov u3, h + + lsr q, u7, #31 + + mov l, #19 + umaddl l, lshort, qshort, l + add u0, u0, l + + adds u0, u0, u4, lsl #32 + extr c, u5, u4, #32 + adcs u1, u1, c + extr c, u6, u5, #32 + adcs u2, u2, c + extr c, u7, u6, #32 + lsl l, q, #63 + eor u3, u3, l + adc u3, u3, c + +// Now we correct by a final 2^255-19 if the top bit is clear +// meaning that the "real" pre-reduced result is negative. 
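The correction described above relies on the value being carried around with a 2^255 offset, so a clear top bit signals a negative "real" remainder; subtracting 19 and masking to 255 bits then adds p_25519 back. A small Python model of that fix-up, assuming the offset value encodes a remainder in [-p, p):

    P = (1 << 255) - 19

    def final_fixup(w):
        # w = r + 2^255 for a true remainder r in [-P, P)
        assert 0 <= w < (1 << 256)
        c = 19 if (w >> 255) == 0 else 0      # "pl": the real result was negative
        return (w - c) & ((1 << 255) - 1)     # optional subtract of 19, then drop bit 255

    for r in (-P, -1, 0, 1, P - 1):
        assert final_fixup(r + (1 << 255)) == r % P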
+ + mov c, #19 + tst u3, #0x8000000000000000 + csel c, c, xzr, pl + subs u0, u0, c + sbcs u1, u1, xzr + sbcs u2, u2, xzr + sbc u3, u3, xzr + and u3, u3, #~0x8000000000000000 + +// Write back result + + stp u0, u1, [x0] + stp u2, u3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sqr_p25519_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sqr_p25519_alt.S new file mode 100644 index 00000000000..4941076a97c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sqr_p25519_alt.S @@ -0,0 +1,178 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square modulo p_25519, z := (x^2) mod p_25519 +// Input x[4]; output z[4] +// +// extern void bignum_sqr_p25519_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p25519_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p25519_alt) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 + +#define h x6 +#define l x7 + +#define u0 x8 +#define u1 x9 +#define u2 x10 +#define u3 x11 +#define u4 x12 +#define u5 x13 +#define u6 x14 + +// Just aliases + +#define q a0 +#define c a1 +#define t a2 +#define u7 h + +S2N_BN_SYMBOL(bignum_sqr_p25519_alt): + +// Load all the elements, set up an initial window [u6;...u1] = [23;03;01] +// and chain in the addition of 02 + 12 + 13 (no carry-out is possible). +// This gives all the "heterogeneous" terms of the squaring ready to double + + ldp a0, a1, [x] + + mul u1, a0, a1 + umulh u2, a0, a1 + + ldp a2, a3, [x, #16] + + mul u3, a0, a3 + umulh u4, a0, a3 + + mul l, a0, a2 + umulh h, a0, a2 + adds u2, u2, l + + adcs u3, u3, h + mul l, a1, a2 + umulh h, a1, a2 + adc h, h, xzr + adds u3, u3, l + + mul u5, a2, a3 + umulh u6, a2, a3 + + adcs u4, u4, h + mul l, a1, a3 + umulh h, a1, a3 + adc h, h, xzr + adds u4, u4, l + + adcs u5, u5, h + adc u6, u6, xzr + +// Now just double it; this simple approach seems to work better than extr + + adds u1, u1, u1 + adcs u2, u2, u2 + adcs u3, u3, u3 + adcs u4, u4, u4 + adcs u5, u5, u5 + adcs u6, u6, u6 + cset u7, cs + +// Add the homogeneous terms 00 + 11 + 22 + 33 + + umulh l, a0, a0 + mul u0, a0, a0 + adds u1, u1, l + + mul l, a1, a1 + adcs u2, u2, l + umulh l, a1, a1 + adcs u3, u3, l + + mul l, a2, a2 + adcs u4, u4, l + umulh l, a2, a2 + adcs u5, u5, l + + mul l, a3, a3 + adcs u6, u6, l + umulh l, a3, a3 + adc u7, u7, l + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [u7,u6,u5,u4] and l = [u3,u2,u1,u0] +// and this is == 38 * h + l (mod p_25519) + + mov c, #38 + + mul l, c, u4 + umulh t, c, u4 + adds u0, u0, l + + mul l, c, u5 + umulh u5, c, u5 + adcs u1, u1, l + + mul l, c, u6 + umulh u6, c, u6 + adcs u2, u2, l + + mul l, c, u7 + umulh u7, c, u7 + adcs u3, u3, l + cset u4, cs + +// Compute the top part deferring the [u5,t] addition till the following +// carry chain. This is enough to get a good quotient estimate and saves +// a couple of instructions. 
+ + adds u3, u3, u6 + adc u4, u4, u7 + +// Now we have reduced to 5 digits, 2^255 * H + L = [u4,u3,u2,u1,u0] +// Use q = H + 1 as the initial quotient estimate, either right or 1 too big. + + adds xzr, u3, u3 + orr u3, u3, #0x8000000000000000 + adc q, u4, u4 + mov c, #19 + madd l, c, q, c + adds u0, u0, l + adcs u1, u1, t + adcs u2, u2, u5 + adcs u3, u3, xzr + +// Now the effective answer is 2^256 * (CF - 1) + [u3,u2,u1,u0] +// So we correct if CF = 0 by subtracting 19, either way masking to +// 255 bits, i.e. by effectively adding p_25519 to the "full" answer + + csel c, c, xzr, cc + subs u0, u0, c + sbcs u1, u1, xzr + sbcs u2, u2, xzr + sbc u3, u3, xzr + bic u3, u3, #0x8000000000000000 + +// Write back and return + + stp u0, u1, [x0] + stp u2, u3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sqrt_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sqrt_p25519.S new file mode 100644 index 00000000000..da80e48a428 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sqrt_p25519.S @@ -0,0 +1,610 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square root modulo p_25519 = 2^255 - 19 +// Input x[4]; output function return (Legendre symbol) and z[4] +// +// extern int64_t bignum_sqrt_p25519(uint64_t z[static 4],uint64_t x[static 4]); +// +// Given a 4-digit input x, returns a modular square root mod p_25519, i.e. +// a z such that z^2 == x (mod p_25519), whenever one exists. The square +// root z is chosen so that its LSB is even (note that p_25519 - z is +// another square root). The function return is the Legendre/Jacobi symbol +// (x//p_25519), which indicates whether indeed x has a modular square root +// and hence whether the result is meaningful: +// +// 0: x is divisible by p_25519 and z is the square root 0 +// +1: x is coprime to p_25519 and z is a square root +// -1: x is coprime to p_25519 but not a quadratic residue +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqrt_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqrt_p25519) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define a sp, #0 +#define b sp, #(4*N) +#define s sp, #(8*N) +#define t sp, #(12*N) + +// Other temporary variables in register + +#define res x19 + +// Total size to reserve on the stack + +#define NSPACE #(16*N) + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +// Macros wrapping up calls to the local subroutines + +#define mulp(dest,src1,src2) \ + add x0, dest __LF \ + add x1, src1 __LF \ + add x2, src2 __LF \ + bl bignum_sqrt_p25519_mul_p25519 + +#define nsqr(dest,n,src) \ + add x0, dest __LF \ + mov x1, n __LF \ + add x2, src __LF \ + bl bignum_sqrt_p25519_nsqr_p25519 + +S2N_BN_SYMBOL(bignum_sqrt_p25519): + +// Save registers and make room for temporaries + + stp x19, x30, [sp, -16]! 
+ sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Set up reduced version of the input argument a = x mod p_25519. Then +// get the candidate square root s = a^{252-2} + + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #19 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, lo + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [a] + stp x4, x5, [a+16] + + // Power 2^2 - 1 = 3 + + nsqr(t,1,a) + mulp(t,t,a) + + // Power 2^4 - 1 = 15 + + nsqr(s,2,t) + mulp(t,s,t) + + // Power 2^5 - 1 = 31 + + nsqr(s,1,t) + mulp(b,s,a) + + // Power 2^10 - 1 + + nsqr(s,5,b) + mulp(t,s,b) + + // Power 2^20 - 1 + + nsqr(s,10,t) + mulp(t,s,t) + + // Power 2^25 - 1 + + nsqr(s,5,t) + mulp(b,s,b) + + // Power 2^50 - 1 + + nsqr(s,25,b) + mulp(t,s,b) + + // Power 2^100 - 1 + nsqr(s,50,t) + mulp(t,s,t) + + // Power 2^125 - 1 + + nsqr(s,25,t) + mulp(b,s,b) + + // Power 2^250 - 1 + + nsqr(s,125,b) + mulp(b,s,b) + + // Power 2^251 - 1 + + nsqr(s,1,b) + mulp(t,s,a) + + // Power 2^252 - 2 + + nsqr(s,1,t) + +// s is now one candidate square root. Generate the other one t = s * j_25519 + + movbig(x0, #0xc4ee, #0x1b27, #0x4a0e, #0xa0b0) + movbig(x1, #0x2f43, #0x1806, #0xad2f, #0xe478) + movbig(x2, #0x2b4d, #0x0099, #0x3dfb, #0xd7a7) + movbig(x3, #0x2b83, #0x2480, #0x4fc1, #0xdf0b) + stp x0, x1, [t] + stp x2, x3, [t+16] + mulp(t,s,t) + +// Now multiplex between them according to whether s^2 = a + + nsqr(b,1,s) + + ldp x10, x11, [a] + ldp x14, x15, [b] + eor x10, x10, x14 + eor x11, x11, x15 + orr x10, x10, x11 + ldp x12, x13, [a+16] + ldp x16, x17, [b+16] + eor x12, x12, x16 + eor x13, x13, x17 + orr x12, x12, x13 + orr x10, x10, x12 + cmp x10, xzr + + ldp x10, x11, [s] + ldp x14, x15, [t] + csel x10, x10, x14, eq + csel x11, x11, x15, eq + ldp x12, x13, [s+16] + ldp x16, x17, [t+16] + csel x12, x12, x16, eq + csel x13, x13, x17, eq + +// For definiteness, choose "positive" (LSB=0) square root + + mov x14, #-19 + subs x14, x14, x10 + mov x16, #-1 + sbcs x15, x16, x11 + mov x17, #0x7FFFFFFFFFFFFFFF + sbcs x16, x16, x12 + sbc x17, x17, x13 + + tst x10, #1 + csel x10, x10, x14, eq + csel x11, x11, x15, eq + csel x12, x12, x16, eq + csel x13, x13, x17, eq + + mov x2, res + stp x10, x11, [x2] + stp x12, x13, [x2, #16] + +// Determine if it is is indeed a square root and also if a = 0 +// Hence return the Legendre-Jacobi symbol as required. 
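A compact Python reference for the square-root and Legendre-symbol logic that the code below completes; it uses pow(2, (p-1)/4, p) as a square root of -1, playing the role of the j_25519 constant loaded above (either square root of -1 works here), and follows the even-root convention from the header comment:

    P = (1 << 255) - 19
    J = pow(2, (P - 1) // 4, P)            # a square root of -1 mod p_25519
    assert (J * J + 1) % P == 0

    def sqrt_p25519(x):
        a = x % P
        s = pow(a, (P + 3) // 8, P)        # a^(2^252 - 2), the candidate root
        if (s * s - a) % P:
            s = s * J % P                  # switch to the other candidate
        if s & 1:
            s = P - s                      # choose the even (LSB = 0) root
        if a == 0:
            return 0, s
        return (1 if (s * s - a) % P == 0 else -1), s

    assert sqrt_p25519(0) == (0, 0)
    assert sqrt_p25519(4) == (1, 2)
    assert sqrt_p25519(2)[0] == -1         # 2 is a non-residue since p = 5 (mod 8)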
+ + add x0, b + mov x1, #1 + bl bignum_sqrt_p25519_nsqr_p25519 + + ldp x10, x11, [a] + ldp x14, x15, [b] + eor x14, x10, x14 + eor x15, x11, x15 + orr x14, x14, x15 + ldp x12, x13, [a+16] + ldp x16, x17, [b+16] + eor x16, x12, x16 + eor x17, x13, x17 + orr x16, x16, x17 + orr x14, x14, x16 + cmp x14, xzr + mov x0, #1 + cneg x0, x0, ne + + orr x10, x10, x11 + orr x12, x12, x13 + orr x10, x10, x12 + cmp x10, xzr + csel x0, x0, xzr, ne + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x19, x30, [sp], 16 + ret + +// ************************************************************* +// Local z = x * y +// ************************************************************* + +bignum_sqrt_p25519_mul_p25519: + ldp x3, x4, [x1] + ldp x5, x6, [x2] + umull x7, w3, w5 + lsr x17, x3, #32 + umull x15, w17, w5 + lsr x16, x5, #32 + umull x8, w16, w17 + umull x16, w3, w16 + adds x7, x7, x15, lsl #32 + lsr x15, x15, #32 + adc x8, x8, x15 + adds x7, x7, x16, lsl #32 + lsr x16, x16, #32 + adc x8, x8, x16 + mul x9, x4, x6 + umulh x10, x4, x6 + subs x4, x4, x3 + cneg x4, x4, lo + csetm x16, lo + adds x9, x9, x8 + adc x10, x10, xzr + subs x3, x5, x6 + cneg x3, x3, lo + cinv x16, x16, lo + mul x15, x4, x3 + umulh x3, x4, x3 + adds x8, x7, x9 + adcs x9, x9, x10 + adc x10, x10, xzr + cmn x16, #1 + eor x15, x15, x16 + adcs x8, x15, x8 + eor x3, x3, x16 + adcs x9, x3, x9 + adc x10, x10, x16 + ldp x3, x4, [x1, #16] + ldp x5, x6, [x2, #16] + umull x11, w3, w5 + lsr x17, x3, #32 + umull x15, w17, w5 + lsr x16, x5, #32 + umull x12, w16, w17 + umull x16, w3, w16 + adds x11, x11, x15, lsl #32 + lsr x15, x15, #32 + adc x12, x12, x15 + adds x11, x11, x16, lsl #32 + lsr x16, x16, #32 + adc x12, x12, x16 + mul x13, x4, x6 + umulh x14, x4, x6 + subs x4, x4, x3 + cneg x4, x4, lo + csetm x16, lo + adds x13, x13, x12 + adc x14, x14, xzr + subs x3, x5, x6 + cneg x3, x3, lo + cinv x16, x16, lo + mul x15, x4, x3 + umulh x3, x4, x3 + adds x12, x11, x13 + adcs x13, x13, x14 + adc x14, x14, xzr + cmn x16, #1 + eor x15, x15, x16 + adcs x12, x15, x12 + eor x3, x3, x16 + adcs x13, x3, x13 + adc x14, x14, x16 + ldp x3, x4, [x1, #16] + ldp x15, x16, [x1] + subs x3, x3, x15 + sbcs x4, x4, x16 + csetm x16, lo + ldp x15, x17, [x2] + subs x5, x15, x5 + sbcs x6, x17, x6 + csetm x17, lo + eor x3, x3, x16 + subs x3, x3, x16 + eor x4, x4, x16 + sbc x4, x4, x16 + eor x5, x5, x17 + subs x5, x5, x17 + eor x6, x6, x17 + sbc x6, x6, x17 + eor x16, x17, x16 + adds x11, x11, x9 + adcs x12, x12, x10 + adcs x13, x13, xzr + adc x14, x14, xzr + mul x2, x3, x5 + umulh x17, x3, x5 + mul x15, x4, x6 + umulh x1, x4, x6 + subs x4, x4, x3 + cneg x4, x4, lo + csetm x9, lo + adds x15, x15, x17 + adc x1, x1, xzr + subs x6, x5, x6 + cneg x6, x6, lo + cinv x9, x9, lo + mul x5, x4, x6 + umulh x6, x4, x6 + adds x17, x2, x15 + adcs x15, x15, x1 + adc x1, x1, xzr + cmn x9, #1 + eor x5, x5, x9 + adcs x17, x5, x17 + eor x6, x6, x9 + adcs x15, x6, x15 + adc x1, x1, x9 + adds x9, x11, x7 + adcs x10, x12, x8 + adcs x11, x13, x11 + adcs x12, x14, x12 + adcs x13, x13, xzr + adc x14, x14, xzr + cmn x16, #1 + eor x2, x2, x16 + adcs x9, x2, x9 + eor x17, x17, x16 + adcs x10, x17, x10 + eor x15, x15, x16 + adcs x11, x15, x11 + eor x1, x1, x16 + adcs x12, x1, x12 + adcs x13, x13, x16 + adc x14, x14, x16 + mov x3, #38 + umull x4, w11, w3 + add x4, x4, w7, uxtw + lsr x7, x7, #32 + lsr x11, x11, #32 + umaddl x11, w11, w3, x7 + mov x7, x4 + umull x4, w12, w3 + add x4, x4, w8, uxtw + lsr x8, x8, #32 + lsr x12, x12, #32 + umaddl x12, w12, w3, x8 + mov x8, x4 + umull x4, w13, w3 + add x4, 
x4, w9, uxtw + lsr x9, x9, #32 + lsr x13, x13, #32 + umaddl x13, w13, w3, x9 + mov x9, x4 + umull x4, w14, w3 + add x4, x4, w10, uxtw + lsr x10, x10, #32 + lsr x14, x14, #32 + umaddl x14, w14, w3, x10 + mov x10, x4 + lsr x17, x14, #31 + mov x5, #19 + umaddl x5, w5, w17, x5 + add x7, x7, x5 + adds x7, x7, x11, lsl #32 + extr x3, x12, x11, #32 + adcs x8, x8, x3 + extr x3, x13, x12, #32 + adcs x9, x9, x3 + extr x3, x14, x13, #32 + lsl x5, x17, #63 + eor x10, x10, x5 + adc x10, x10, x3 + mov x3, #19 + tst x10, #0x8000000000000000 + csel x3, x3, xzr, pl + subs x7, x7, x3 + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbc x10, x10, xzr + and x10, x10, #0x7fffffffffffffff + stp x7, x8, [x0] + stp x9, x10, [x0, #16] + ret + +// ************************************************************* +// Local z = 2^n * x +// ************************************************************* + +bignum_sqrt_p25519_nsqr_p25519: + +// Copy input argument into [x13;x12;x11;x10] + + ldp x10, x11, [x2] + ldp x12, x13, [x2, #16] + +// Main squaring loop, accumulating in [x13;x12;x11;x10] consistently and +// only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38 + +bignum_sqrt_p25519_loop: + umull x2, w10, w10 + lsr x14, x10, #32 + umull x3, w14, w14 + umull x14, w10, w14 + adds x2, x2, x14, lsl #33 + lsr x14, x14, #31 + adc x3, x3, x14 + umull x4, w11, w11 + lsr x14, x11, #32 + umull x5, w14, w14 + umull x14, w11, w14 + mul x15, x10, x11 + umulh x16, x10, x11 + adds x4, x4, x14, lsl #33 + lsr x14, x14, #31 + adc x5, x5, x14 + adds x15, x15, x15 + adcs x16, x16, x16 + adc x5, x5, xzr + adds x3, x3, x15 + adcs x4, x4, x16 + adc x5, x5, xzr + umull x6, w12, w12 + lsr x14, x12, #32 + umull x7, w14, w14 + umull x14, w12, w14 + adds x6, x6, x14, lsl #33 + lsr x14, x14, #31 + adc x7, x7, x14 + umull x8, w13, w13 + lsr x14, x13, #32 + umull x9, w14, w14 + umull x14, w13, w14 + mul x15, x12, x13 + umulh x16, x12, x13 + adds x8, x8, x14, lsl #33 + lsr x14, x14, #31 + adc x9, x9, x14 + adds x15, x15, x15 + adcs x16, x16, x16 + adc x9, x9, xzr + adds x7, x7, x15 + adcs x8, x8, x16 + adc x9, x9, xzr + subs x10, x10, x12 + sbcs x11, x11, x13 + csetm x16, lo + eor x10, x10, x16 + subs x10, x10, x16 + eor x11, x11, x16 + sbc x11, x11, x16 + adds x6, x6, x4 + adcs x7, x7, x5 + adcs x8, x8, xzr + adc x9, x9, xzr + umull x12, w10, w10 + lsr x5, x10, #32 + umull x13, w5, w5 + umull x5, w10, w5 + adds x12, x12, x5, lsl #33 + lsr x5, x5, #31 + adc x13, x13, x5 + umull x15, w11, w11 + lsr x5, x11, #32 + umull x14, w5, w5 + umull x5, w11, w5 + mul x4, x10, x11 + umulh x16, x10, x11 + adds x15, x15, x5, lsl #33 + lsr x5, x5, #31 + adc x14, x14, x5 + adds x4, x4, x4 + adcs x16, x16, x16 + adc x14, x14, xzr + adds x13, x13, x4 + adcs x15, x15, x16 + adc x14, x14, xzr + adds x4, x2, x6 + adcs x5, x3, x7 + adcs x6, x6, x8 + adcs x7, x7, x9 + csetm x16, lo + subs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x15 + sbcs x7, x7, x14 + adcs x8, x8, x16 + adc x9, x9, x16 + mov x10, #38 + umull x12, w6, w10 + add x12, x12, w2, uxtw + lsr x2, x2, #32 + lsr x6, x6, #32 + umaddl x6, w6, w10, x2 + mov x2, x12 + umull x12, w7, w10 + add x12, x12, w3, uxtw + lsr x3, x3, #32 + lsr x7, x7, #32 + umaddl x7, w7, w10, x3 + mov x3, x12 + umull x12, w8, w10 + add x12, x12, w4, uxtw + lsr x4, x4, #32 + lsr x8, x8, #32 + umaddl x8, w8, w10, x4 + mov x4, x12 + umull x12, w9, w10 + add x12, x12, w5, uxtw + lsr x5, x5, #32 + lsr x9, x9, #32 + umaddl x9, w9, w10, x5 + mov x5, x12 + lsr x13, x9, #31 + mov x11, #19 + umull x11, w11, w13 + add x2, x2, x11 + adds x10, x2, x6, 
lsl #32 + extr x12, x7, x6, #32 + adcs x11, x3, x12 + extr x12, x8, x7, #32 + adcs x12, x4, x12 + extr x14, x9, x8, #32 + lsl x15, x13, #63 + eor x5, x5, x15 + adc x13, x5, x14 + +// Loop as applicable + + subs x1, x1, #1 + bne bignum_sqrt_p25519_loop + +// We know the intermediate result x < 2^256 - 38, and now we do strict +// modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255 +// which is equivalent to a "pl" condition. + + adds x6, x10, #19 + adcs x7, x11, xzr + adcs x8, x12, xzr + adcs x9, x13, xzr + + csel x10, x10, x6, pl + csel x11, x11, x7, pl + csel x12, x12, x8, pl + csel x13, x13, x9, pl + bic x13, x13, #0x8000000000000000 + +// Copy result back into destination and return + + stp x10, x11, [x0] + stp x12, x13, [x0, #16] + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sqrt_p25519_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sqrt_p25519_alt.S new file mode 100644 index 00000000000..ac33ef9a160 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sqrt_p25519_alt.S @@ -0,0 +1,473 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square root modulo p_25519 = 2^255 - 19 +// Input x[4]; output function return (Legendre symbol) and z[4] +// +// extern int64_t bignum_sqrt_p25519_alt(uint64_t z[static 4],uint64_t x[static 4]); +// +// Given a 4-digit input x, returns a modular square root mod p_25519, i.e. +// a z such that z^2 == x (mod p_25519), whenever one exists. The square +// root z is chosen so that its LSB is even (note that p_25519 - z is +// another square root). The function return is the Legendre/Jacobi symbol +// (x//p_25519), which indicates whether indeed x has a modular square root +// and hence whether the result is meaningful: +// +// 0: x is divisible by p_25519 and z is the square root 0 +// +1: x is coprime to p_25519 and z is a square root +// -1: x is coprime to p_25519 but not a quadratic residue +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqrt_p25519_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqrt_p25519_alt) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define a sp, #0 +#define b sp, #(4*N) +#define s sp, #(8*N) +#define t sp, #(12*N) + +// Other temporary variables in register + +#define res x19 + +// Total size to reserve on the stack + +#define NSPACE #(16*N) + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +// Macros wrapping up calls to the local subroutines + +#define mulp(dest,src1,src2) \ + add x0, dest __LF \ + add x1, src1 __LF \ + add x2, src2 __LF \ + bl bignum_sqrt_p25519_alt_mul_p25519 + +#define nsqr(dest,n,src) \ + add x0, dest __LF \ + mov x1, n __LF \ + add x2, src __LF \ + bl bignum_sqrt_p25519_alt_nsqr_p25519 + +S2N_BN_SYMBOL(bignum_sqrt_p25519_alt): + +// Save registers and make room for temporaries + + stp x19, x30, [sp, -16]! 
+ sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Set up reduced version of the input argument a = x mod p_25519. Then +// get the candidate square root s = a^{252-2} + + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #19 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, lo + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [a] + stp x4, x5, [a+16] + + // Power 2^2 - 1 = 3 + + nsqr(t,1,a) + mulp(t,t,a) + + // Power 2^4 - 1 = 15 + + nsqr(s,2,t) + mulp(t,s,t) + + // Power 2^5 - 1 = 31 + + nsqr(s,1,t) + mulp(b,s,a) + + // Power 2^10 - 1 + + nsqr(s,5,b) + mulp(t,s,b) + + // Power 2^20 - 1 + + nsqr(s,10,t) + mulp(t,s,t) + + // Power 2^25 - 1 + + nsqr(s,5,t) + mulp(b,s,b) + + // Power 2^50 - 1 + + nsqr(s,25,b) + mulp(t,s,b) + + // Power 2^100 - 1 + nsqr(s,50,t) + mulp(t,s,t) + + // Power 2^125 - 1 + + nsqr(s,25,t) + mulp(b,s,b) + + // Power 2^250 - 1 + + nsqr(s,125,b) + mulp(b,s,b) + + // Power 2^251 - 1 + + nsqr(s,1,b) + mulp(t,s,a) + + // Power 2^252 - 2 + + nsqr(s,1,t) + +// s is now one candidate square root. Generate the other one t = s * j_25519 + + movbig(x0, #0xc4ee, #0x1b27, #0x4a0e, #0xa0b0) + movbig(x1, #0x2f43, #0x1806, #0xad2f, #0xe478) + movbig(x2, #0x2b4d, #0x0099, #0x3dfb, #0xd7a7) + movbig(x3, #0x2b83, #0x2480, #0x4fc1, #0xdf0b) + stp x0, x1, [t] + stp x2, x3, [t+16] + mulp(t,s,t) + +// Now multiplex between them according to whether s^2 = a + + nsqr(b,1,s) + + ldp x10, x11, [a] + ldp x14, x15, [b] + eor x10, x10, x14 + eor x11, x11, x15 + orr x10, x10, x11 + ldp x12, x13, [a+16] + ldp x16, x17, [b+16] + eor x12, x12, x16 + eor x13, x13, x17 + orr x12, x12, x13 + orr x10, x10, x12 + cmp x10, xzr + + ldp x10, x11, [s] + ldp x14, x15, [t] + csel x10, x10, x14, eq + csel x11, x11, x15, eq + ldp x12, x13, [s+16] + ldp x16, x17, [t+16] + csel x12, x12, x16, eq + csel x13, x13, x17, eq + +// For definiteness, choose "positive" (LSB=0) square root + + mov x14, #-19 + subs x14, x14, x10 + mov x16, #-1 + sbcs x15, x16, x11 + mov x17, #0x7FFFFFFFFFFFFFFF + sbcs x16, x16, x12 + sbc x17, x17, x13 + + tst x10, #1 + csel x10, x10, x14, eq + csel x11, x11, x15, eq + csel x12, x12, x16, eq + csel x13, x13, x17, eq + + mov x2, res + stp x10, x11, [x2] + stp x12, x13, [x2, #16] + +// Determine if it is is indeed a square root and also if a = 0 +// Hence return the Legendre-Jacobi symbol as required. 
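The nsqr/mulp ladder above can be checked by mirroring each step on exponents (nsqr multiplies the exponent by 2^n, mulp adds the two exponents); doing so confirms the candidate root is a^(2^252 - 2) = a^((p_25519 + 3)/8):

    e_a = 1
    e_t = (e_a << 1) + e_a          # 2^2 - 1
    e_t = (e_t << 2) + e_t          # 2^4 - 1
    e_b = (e_t << 1) + e_a          # 2^5 - 1
    e_t = (e_b << 5) + e_b          # 2^10 - 1
    e_t = (e_t << 10) + e_t         # 2^20 - 1
    e_b = (e_t << 5) + e_b          # 2^25 - 1
    e_t = (e_b << 25) + e_b         # 2^50 - 1
    e_t = (e_t << 50) + e_t         # 2^100 - 1
    e_b = (e_t << 25) + e_b         # 2^125 - 1
    e_b = (e_b << 125) + e_b        # 2^250 - 1
    e_t = (e_b << 1) + e_a          # 2^251 - 1
    e_s = e_t << 1                  # 2^252 - 2
    assert e_s == (1 << 252) - 2 == (((1 << 255) - 19) + 3) // 8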
+ + add x0, b + mov x1, #1 + bl bignum_sqrt_p25519_alt_nsqr_p25519 + + ldp x10, x11, [a] + ldp x14, x15, [b] + eor x14, x10, x14 + eor x15, x11, x15 + orr x14, x14, x15 + ldp x12, x13, [a+16] + ldp x16, x17, [b+16] + eor x16, x12, x16 + eor x17, x13, x17 + orr x16, x16, x17 + orr x14, x14, x16 + cmp x14, xzr + mov x0, #1 + cneg x0, x0, ne + + orr x10, x10, x11 + orr x12, x12, x13 + orr x10, x10, x12 + cmp x10, xzr + csel x0, x0, xzr, ne + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x19, x30, [sp], 16 + ret + +// ************************************************************* +// Local z = x * y +// ************************************************************* + +bignum_sqrt_p25519_alt_mul_p25519: + ldp x3, x4, [x1] + ldp x7, x8, [x2] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x2, #16] + mul x11, x3, x9 + umulh x15, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x16, x3, x10 + adcs x15, x15, x11 + adc x16, x16, xzr + ldp x5, x6, [x1, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x15, x15, x11 + mul x11, x4, x10 + adcs x16, x16, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x15, x15, x11 + umulh x11, x4, x9 + adcs x16, x16, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x15, x15, x11 + mul x11, x5, x9 + adcs x16, x16, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x15, x15, x11 + umulh x11, x5, x8 + adcs x16, x16, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x15, x15, x11 + mul x11, x6, x8 + adcs x16, x16, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x16, x16, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + mov x7, #38 + mul x11, x7, x16 + umulh x9, x7, x16 + adds x12, x12, x11 + mul x11, x7, x3 + umulh x3, x7, x3 + adcs x13, x13, x11 + mul x11, x7, x4 + umulh x4, x7, x4 + adcs x14, x14, x11 + mul x11, x7, x5 + umulh x5, x7, x5 + adcs x15, x15, x11 + cset x16, hs + adds x15, x15, x4 + adc x16, x16, x5 + cmn x15, x15 + orr x15, x15, #0x8000000000000000 + adc x8, x16, x16 + mov x7, #19 + madd x11, x7, x8, x7 + adds x12, x12, x11 + adcs x13, x13, x9 + adcs x14, x14, x3 + adcs x15, x15, xzr + csel x7, x7, xzr, lo + subs x12, x12, x7 + sbcs x13, x13, xzr + sbcs x14, x14, xzr + sbc x15, x15, xzr + and x15, x15, #0x7fffffffffffffff + stp x12, x13, [x0] + stp x14, x15, [x0, #16] + ret + +// ************************************************************* +// Local z = 2^n * x +// ************************************************************* + +bignum_sqrt_p25519_alt_nsqr_p25519: + +// Copy input argument into [x5;x4;x3;x2] (overwriting input pointer x20 + + ldp x6, x3, [x2] + ldp x4, x5, [x2, #16] + mov x2, x6 + +// Main squaring loop, accumulating in [x5;x4;x3;x2] consistently and +// only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38 + +bignum_sqrt_p25519_alt_loop: + mul x9, x2, x3 + umulh x10, x2, x3 + mul x11, x2, x5 + umulh x12, x2, x5 + mul x7, x2, x4 + umulh x6, x2, x4 + adds x10, x10, x7 + adcs x11, x11, x6 + mul x7, x3, x4 + umulh x6, x3, x4 + adc x6, x6, xzr + adds x11, x11, x7 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x6 + mul x7, x3, x5 + umulh x6, x3, x5 + adc 
x6, x6, xzr + adds x12, x12, x7 + adcs x13, x13, x6 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x6, hs + umulh x7, x2, x2 + mul x8, x2, x2 + adds x9, x9, x7 + mul x7, x3, x3 + adcs x10, x10, x7 + umulh x7, x3, x3 + adcs x11, x11, x7 + mul x7, x4, x4 + adcs x12, x12, x7 + umulh x7, x4, x4 + adcs x13, x13, x7 + mul x7, x5, x5 + adcs x14, x14, x7 + umulh x7, x5, x5 + adc x6, x6, x7 + mov x3, #38 + mul x7, x3, x12 + umulh x4, x3, x12 + adds x8, x8, x7 + mul x7, x3, x13 + umulh x13, x3, x13 + adcs x9, x9, x7 + mul x7, x3, x14 + umulh x14, x3, x14 + adcs x10, x10, x7 + mul x7, x3, x6 + umulh x6, x3, x6 + adcs x11, x11, x7 + cset x12, hs + adds x11, x11, x14 + adc x12, x12, x6 + cmn x11, x11 + bic x11, x11, #0x8000000000000000 + adc x2, x12, x12 + mov x3, #0x13 + mul x7, x3, x2 + adds x2, x8, x7 + adcs x3, x9, x4 + adcs x4, x10, x13 + adc x5, x11, xzr + +// Loop as applicable + + subs x1, x1, #1 + bne bignum_sqrt_p25519_alt_loop + +// We know the intermediate result x < 2^256 - 38, and now we do strict +// modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255 +// which is equivalent to a "pl" condition. + + adds x6, x2, #19 + adcs x7, x3, xzr + adcs x8, x4, xzr + adcs x9, x5, xzr + + csel x2, x2, x6, pl + csel x3, x3, x7, pl + csel x4, x4, x8, pl + csel x5, x5, x9, pl + bic x5, x5, #0x8000000000000000 + +// Copy result back into destination and return + + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sub_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sub_p25519.S new file mode 100644 index 00000000000..001a2b45042 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sub_p25519.S @@ -0,0 +1,68 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Subtract modulo p_25519, z := (x - y) mod p_25519 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_sub_p25519 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub_p25519) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 +#define c x3 +#define l x4 +#define d0 x5 +#define d1 x6 +#define d2 x7 +#define d3 x8 + +S2N_BN_SYMBOL(bignum_sub_p25519): + +// First just subtract the numbers as [d3; d2; d1; d0] = x - y, +// with the inverted carry flag meaning CF <=> x >= y. + + ldp d0, d1, [x] + ldp l, c, [y] + subs d0, d0, l + sbcs d1, d1, c + ldp d2, d3, [x, #16] + ldp l, c, [y, #16] + sbcs d2, d2, l + sbcs d3, d3, c + +// Now if x < y we want to add back p_25519, which staying within 255 bits +// means subtracting 19, since p_25519 = 2^255 - 19. +// Let c be that constant 19 when x < y, zero otherwise. 
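Review note: the identity this correction relies on (after a borrow, adding back p_25519 within 255 bits is the same as subtracting 19 and masking off bit 255, because the borrow already contributed 2^256) is easy to sanity-check in Python. The sketch below is a word-level model of the function's contract, not of the limb-by-limb code; the random self-test is only illustrative.

import random

p = 2**255 - 19

def bignum_sub_p25519_model(x, y):
    d = (x - y) % 2**256             # raw 4-limb subtraction; borrow <=> x < y
    c = 19 if x < y else 0           # "that constant 19 when x < y, zero otherwise"
    return (d - c) & (2**255 - 1)    # subtract c, then mask to 255 bits

for _ in range(10000):
    x, y = random.randrange(p), random.randrange(p)
    assert bignum_sub_p25519_model(x, y) == (x - y) % p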
+ + mov l, #19 + csel c, l, xzr, cc + +// Correct by adding the optional constant and masking to 255 bits + + subs d0, d0, c + sbcs d1, d1, xzr + sbcs d2, d2, xzr + sbc d3, d3, xzr + and d3, d3, #0x7FFFFFFFFFFFFFFF + +// Store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_ladderstep.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_ladderstep.S new file mode 100644 index 00000000000..941c83f795e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_ladderstep.S @@ -0,0 +1,962 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery ladder step on pairs of (X,Z)-projective curve25519 points +// +// extern void curve25519_ladderstep +// (uint64_t rr[16],uint64_t point[8],uint64_t pp[16],uint64_t b) +// +// If point = (X,1) and pp = (n * (X,1),[n+1] * (X,1)) then the output +// rr = (n' * (X,1),[n'+1] * (X,1)) where n' = 2 * n + b, with input +// b assumed to be 0 or 1; in this setting, each pair (X,Z) is assumed to +// be a projective y-free representation of an affine curve25519 point +// (X/Z,y), with the initial "differential" point having Z = 1 and X its +// affine x coordinate. In other words, the ladderstep operation is a +// combination of doubling, differential addition and optional swapping. +// +// Standard ARM ABI: X0 = rr, X1 = point, X2 = pp, X3 = b +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(curve25519_ladderstep) + S2N_BN_SYM_PRIVACY_DIRECTIVE(curve25519_ladderstep) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define rr x17 +#define point x19 +#define pp x20 +#define b x21 + +// Pointer-offset pairs for inputs and outputs + +#define x point, #0 +#define z point, #NUMSIZE +#define xn pp, #0 +#define zn pp, #NUMSIZE +#define xm pp, #(2*NUMSIZE) +#define zm pp, #(3*NUMSIZE) +#define res0 rr, #0 +#define res1 rr, #NUMSIZE +#define res2 rr, #(2*NUMSIZE) +#define res3 rr, #(3*NUMSIZE) + +// Pointer-offset pairs for temporaries on stack + +#define sm sp, #(0*NUMSIZE) +#define sn sp, #(1*NUMSIZE) +#define dm sp, #(2*NUMSIZE) +#define dn sp, #(3*NUMSIZE) +#define dmsn sp, #(4*NUMSIZE) +#define dnsm sp, #(5*NUMSIZE) +#define s sp, #(6*NUMSIZE) +#define d sp, #(7*NUMSIZE) +#define p sp, #(8*NUMSIZE) + +// More, but aliases to above + +#define sumx sm +#define sumz sn +#define dubx dm +#define dubz dn +#define e dubz +#define spro dnsm +#define dpro sumz + +// Total size to reserve on the stack + +#define NSPACE (9*NUMSIZE) + +// Macros wrapping up the basic field operations bignum_mul_p25519 +// and bignum_sqr_p25519, only trivially different from pure function +// call to those subroutines. 
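Review note: before the macro bodies, it may help to have the ladder step written out at the field level. The Python sketch below models the contract from the header comment (outputs for n' = 2*n + b) using the same intermediate names as the code (sn, dn, sm, dm, sumx/sumz for the differential addition, s, d, e and dubx/dubz for the doubling with the constant 121666). It models the mathematics only, not the register scheduling, the deferred reductions, or the constant-time multiplexing.

p = 2**255 - 19

def ladderstep(x1, xn, zn, xm, zm, b):
    # (xn,zn) ~ n*P and (xm,zm) ~ (n+1)*P with P = (x1,1); return the pairs
    # for n'*P and (n'+1)*P where n' = 2*n + b and b is 0 or 1.
    if b:  # swap so the half being doubled is always (xn,zn)
        (xn, zn), (xm, zm) = (xm, zm), (xn, zn)
    sn, dn = (xn + zn) % p, (xn - zn) % p
    sm, dm = (xm + zm) % p, (xm - zm) % p
    # Differential addition of n*P and (n+1)*P, with known difference P
    sumx = pow(dm * sn + dn * sm, 2, p)
    sumz = (x1 * pow(dm * sn - dn * sm, 2, p)) % p
    # Doubling: s = (xt+zt)^2, d = (xt-zt)^2, e = s - d = 4*xt*zt
    s, d = pow(sn, 2, p), pow(dn, 2, p)
    e = (s - d) % p
    dubx = (s * d) % p
    dubz = (e * (d + 121666 * e)) % p
    return ((sumx, sumz), (dubx, dubz)) if b else ((dubx, dubz), (sumx, sumz))

A full x-only scalar multiplication is essentially this step applied once per scalar bit, most significant bit first, starting from the pair ((1,0), (x1,1)) representing 0*P and 1*P.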
+ +#define mul_p25519(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, 
x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umaddl x5, w5, w0, x5 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + mov x3, #0x13 __LF \ + tst x10, #0x8000000000000000 __LF \ + csel x3, x3, xzr, pl __LF \ + subs x7, x7, x3 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + and x10, x10, #0x7fffffffffffffff __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] + +#define sqr_p25519(P0,P1) \ + ldp x10, x11, [P1] __LF \ + ldp x12, x13, [P1+16] __LF \ + umull x2, w10, w10 __LF \ + lsr x14, x10, #32 __LF \ + umull x3, w14, w14 __LF \ + umull x14, w10, w14 __LF \ + adds x2, x2, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x3, x3, x14 __LF \ + umull x4, w11, w11 __LF \ + lsr x14, x11, #32 __LF \ + umull x5, w14, w14 __LF \ + umull x14, w11, w14 __LF \ + mul x15, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x4, x4, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x5, x5, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x5, x5, xzr __LF \ + adds x3, x3, x15 __LF \ + adcs x4, x4, x16 __LF \ + adc x5, x5, xzr __LF \ + umull x6, w12, w12 __LF \ + lsr x14, x12, #32 __LF \ + umull x7, w14, w14 __LF \ + umull x14, w12, w14 __LF \ + adds x6, x6, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x7, x7, x14 __LF \ + umull x8, w13, w13 __LF \ + lsr x14, x13, #32 __LF \ + umull x9, w14, w14 __LF \ + umull x14, w13, w14 __LF \ + mul x15, x12, x13 __LF \ + umulh x16, x12, x13 __LF \ + adds x8, x8, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x9, x9, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x9, x9, xzr __LF \ + adds x7, x7, x15 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, xzr __LF \ + subs x10, x10, x12 __LF \ + sbcs x11, x11, x13 __LF \ + csetm x16, cc __LF \ + eor x10, x10, x16 __LF \ + subs x10, x10, x16 __LF \ + eor x11, x11, x16 __LF \ + sbc x11, x11, x16 __LF \ + adds x6, x6, x4 __LF \ + adcs x7, x7, x5 __LF \ + adcs x8, x8, xzr __LF \ + adc x9, x9, xzr __LF \ + umull x12, w10, w10 __LF \ + lsr x5, x10, #32 __LF \ + umull x13, w5, w5 __LF \ + umull x5, w10, w5 __LF \ + adds x12, x12, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x13, x13, x5 __LF \ + umull x15, w11, w11 __LF \ + lsr x5, x11, #32 __LF \ + umull x14, w5, w5 __LF \ + umull x5, w11, w5 __LF \ + mul x4, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x15, x15, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x14, x14, x5 __LF \ + adds x4, x4, x4 __LF \ + adcs x16, x16, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x13, x13, x4 __LF \ + adcs x15, x15, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x4, x2, x6 __LF \ + adcs x5, x3, x7 __LF \ + adcs x6, x6, x8 __LF \ + 
adcs x7, x7, x9 __LF \ + csetm x16, cc __LF \ + subs x4, x4, x12 __LF \ + sbcs x5, x5, x13 __LF \ + sbcs x6, x6, x15 __LF \ + sbcs x7, x7, x14 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, x16 __LF \ + mov x10, #0x26 __LF \ + umull x12, w6, w10 __LF \ + add x12, x12, w2, uxtw __LF \ + lsr x2, x2, #32 __LF \ + lsr x6, x6, #32 __LF \ + umaddl x6, w6, w10, x2 __LF \ + mov x2, x12 __LF \ + umull x12, w7, w10 __LF \ + add x12, x12, w3, uxtw __LF \ + lsr x3, x3, #32 __LF \ + lsr x7, x7, #32 __LF \ + umaddl x7, w7, w10, x3 __LF \ + mov x3, x12 __LF \ + umull x12, w8, w10 __LF \ + add x12, x12, w4, uxtw __LF \ + lsr x4, x4, #32 __LF \ + lsr x8, x8, #32 __LF \ + umaddl x8, w8, w10, x4 __LF \ + mov x4, x12 __LF \ + umull x12, w9, w10 __LF \ + add x12, x12, w5, uxtw __LF \ + lsr x5, x5, #32 __LF \ + lsr x9, x9, #32 __LF \ + umaddl x9, w9, w10, x5 __LF \ + mov x5, x12 __LF \ + lsr x13, x9, #31 __LF \ + mov x11, #0x13 __LF \ + umaddl x11, w11, w13, x11 __LF \ + add x2, x2, x11 __LF \ + adds x2, x2, x6, lsl #32 __LF \ + extr x10, x7, x6, #32 __LF \ + adcs x3, x3, x10 __LF \ + extr x10, x8, x7, #32 __LF \ + adcs x4, x4, x10 __LF \ + extr x10, x9, x8, #32 __LF \ + lsl x11, x13, #63 __LF \ + eor x5, x5, x11 __LF \ + adc x5, x5, x10 __LF \ + mov x10, #0x13 __LF \ + tst x5, #0x8000000000000000 __LF \ + csel x10, x10, xzr, pl __LF \ + subs x2, x2, x10 __LF \ + sbcs x3, x3, xzr __LF \ + sbcs x4, x4, xzr __LF \ + sbc x5, x5, xzr __LF \ + and x5, x5, #0x7fffffffffffffff __LF \ + stp x2, x3, [P0] __LF \ + stp x4, x5, [P0+16] + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. + +#define mul_4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + 
adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umull x5, w5, w0 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. 
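Review note: the reduction shared by mul_4/sqr_4 and their fully reducing counterparts rests on 2^256 = 38 (mod p_25519) and 2^255 = 19 (mod p_25519). A rough Python model of the idea follows: two folds give a value below 2*p_25519, and the fully reduced macros add one conditional subtraction of p. It tracks the arithmetic only, not the exact carry chain or the "+1" quotient-estimate refinement mentioned above.

import random

p = 2**255 - 19

def fold_lt_2p(n):
    # Fold the high 256 bits back in via 2^256 = 38 (mod p), then fold the
    # top bit via 2^255 = 19 (mod p); the result is only guaranteed < 2*p.
    n = (n % 2**256) + 38 * (n >> 256)
    n = (n % 2**255) + 19 * (n >> 255)
    return n

def mul_p25519_model(a, b):
    # Fully reducing variant: one conditional final subtraction of p.
    t = fold_lt_2p(a * b)
    return t - p if t >= p else t

for _ in range(10000):
    a, b = random.randrange(2**256), random.randrange(2**256)
    t = fold_lt_2p(a * b)
    assert t < 2 * p and t % p == (a * b) % p
    assert mul_p25519_model(a, b) == (a * b) % p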
+ +#define sqr_4(P0,P1) \ + ldp x10, x11, [P1] __LF \ + ldp x12, x13, [P1+16] __LF \ + umull x2, w10, w10 __LF \ + lsr x14, x10, #32 __LF \ + umull x3, w14, w14 __LF \ + umull x14, w10, w14 __LF \ + adds x2, x2, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x3, x3, x14 __LF \ + umull x4, w11, w11 __LF \ + lsr x14, x11, #32 __LF \ + umull x5, w14, w14 __LF \ + umull x14, w11, w14 __LF \ + mul x15, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x4, x4, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x5, x5, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x5, x5, xzr __LF \ + adds x3, x3, x15 __LF \ + adcs x4, x4, x16 __LF \ + adc x5, x5, xzr __LF \ + umull x6, w12, w12 __LF \ + lsr x14, x12, #32 __LF \ + umull x7, w14, w14 __LF \ + umull x14, w12, w14 __LF \ + adds x6, x6, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x7, x7, x14 __LF \ + umull x8, w13, w13 __LF \ + lsr x14, x13, #32 __LF \ + umull x9, w14, w14 __LF \ + umull x14, w13, w14 __LF \ + mul x15, x12, x13 __LF \ + umulh x16, x12, x13 __LF \ + adds x8, x8, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x9, x9, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x9, x9, xzr __LF \ + adds x7, x7, x15 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, xzr __LF \ + subs x10, x10, x12 __LF \ + sbcs x11, x11, x13 __LF \ + csetm x16, cc __LF \ + eor x10, x10, x16 __LF \ + subs x10, x10, x16 __LF \ + eor x11, x11, x16 __LF \ + sbc x11, x11, x16 __LF \ + adds x6, x6, x4 __LF \ + adcs x7, x7, x5 __LF \ + adcs x8, x8, xzr __LF \ + adc x9, x9, xzr __LF \ + umull x12, w10, w10 __LF \ + lsr x5, x10, #32 __LF \ + umull x13, w5, w5 __LF \ + umull x5, w10, w5 __LF \ + adds x12, x12, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x13, x13, x5 __LF \ + umull x15, w11, w11 __LF \ + lsr x5, x11, #32 __LF \ + umull x14, w5, w5 __LF \ + umull x5, w11, w5 __LF \ + mul x4, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x15, x15, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x14, x14, x5 __LF \ + adds x4, x4, x4 __LF \ + adcs x16, x16, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x13, x13, x4 __LF \ + adcs x15, x15, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x4, x2, x6 __LF \ + adcs x5, x3, x7 __LF \ + adcs x6, x6, x8 __LF \ + adcs x7, x7, x9 __LF \ + csetm x16, cc __LF \ + subs x4, x4, x12 __LF \ + sbcs x5, x5, x13 __LF \ + sbcs x6, x6, x15 __LF \ + sbcs x7, x7, x14 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, x16 __LF \ + mov x10, #0x26 __LF \ + umull x12, w6, w10 __LF \ + add x12, x12, w2, uxtw __LF \ + lsr x2, x2, #32 __LF \ + lsr x6, x6, #32 __LF \ + umaddl x6, w6, w10, x2 __LF \ + mov x2, x12 __LF \ + umull x12, w7, w10 __LF \ + add x12, x12, w3, uxtw __LF \ + lsr x3, x3, #32 __LF \ + lsr x7, x7, #32 __LF \ + umaddl x7, w7, w10, x3 __LF \ + mov x3, x12 __LF \ + umull x12, w8, w10 __LF \ + add x12, x12, w4, uxtw __LF \ + lsr x4, x4, #32 __LF \ + lsr x8, x8, #32 __LF \ + umaddl x8, w8, w10, x4 __LF \ + mov x4, x12 __LF \ + umull x12, w9, w10 __LF \ + add x12, x12, w5, uxtw __LF \ + lsr x5, x5, #32 __LF \ + lsr x9, x9, #32 __LF \ + umaddl x9, w9, w10, x5 __LF \ + mov x5, x12 __LF \ + lsr x13, x9, #31 __LF \ + mov x11, #0x13 __LF \ + umull x11, w11, w13 __LF \ + add x2, x2, x11 __LF \ + adds x2, x2, x6, lsl #32 __LF \ + extr x10, x7, x6, #32 __LF \ + adcs x3, x3, x10 __LF \ + extr x10, x8, x7, #32 __LF \ + adcs x4, x4, x10 __LF \ + extr x10, x9, x8, #32 __LF \ + lsl x11, x13, #63 __LF \ + eor x5, x5, x11 __LF \ + adc x5, x5, x10 __LF \ + stp x2, x3, [P0] __LF \ + stp 
x4, x5, [P0+16] + +// Plain 4-digit add without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result + +#define add_4(p0,p1,p2) \ + ldp x0, x1, [p1] __LF \ + ldp x4, x5, [p2] __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + ldp x2, x3, [p1+16] __LF \ + ldp x6, x7, [p2+16] __LF \ + adcs x2, x2, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [p0] __LF \ + stp x2, x3, [p0+16] + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(p0,p1,p2) \ + ldp x5, x6, [p1] __LF \ + ldp x4, x3, [p2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [p1+16] __LF \ + ldp x4, x3, [p2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x3, #19 __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + mov x4, #0x8000000000000000 __LF \ + sbc x8, x8, x4 __LF \ + stp x5, x6, [p0] __LF \ + stp x7, x8, [p0+16] + +// Modular addition with double modulus 2 * p_25519 = 2^256 - 38. +// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. + +#define add_twice4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(p0,p1,p2) \ + ldp x5, x6, [p1] __LF \ + ldp x4, x3, [p2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [p1+16] __LF \ + ldp x4, x3, [p2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [p0] __LF \ + stp x7, x8, [p0+16] + +// Combined z = c * x + y with reduction only < 2 * p_25519 +// where c is initially in the X1 register. It is assumed +// that 19 * (c * x + y) < 2^60 * 2^256 so we don't need a +// high mul in the final part. 
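Review note: the bound quoted above is what lets the last step of cmadd_4 use a single low 64-bit multiply by 19: the folded quotient (the role played by x8 in the macro, i.e. the value shifted down by 255 bits) stays small enough that 19 times it fits in 64 bits. A small Python sketch of that argument, using the constant 121666 that the ladder step later loads into x1 as the illustrative c:

p = 2**255 - 19

def cmadd_model(c, x, y):
    n = c * x + y
    q, l = n >> 255, n % 2**255      # q plays the role of the folded quotient (x8)
    assert 19 * q < 2**64            # follows from 19*(c*x + y) < 2^60 * 2^256
    return l + 19 * q                # < 2*p_25519, so no umulh is needed

# Worst-case 4-digit inputs with the ladder's constant still satisfy the bound
assert cmadd_model(121666, 2**256 - 1, 2**256 - 1) < 2 * p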
+ +#define cmadd_4(p0,p2,p3) \ + ldp x7, x8, [p2] __LF \ + ldp x9, x10, [p2+16] __LF \ + mul x3, x1, x7 __LF \ + mul x4, x1, x8 __LF \ + mul x5, x1, x9 __LF \ + mul x6, x1, x10 __LF \ + umulh x7, x1, x7 __LF \ + umulh x8, x1, x8 __LF \ + umulh x9, x1, x9 __LF \ + umulh x10, x1, x10 __LF \ + adds x4, x4, x7 __LF \ + adcs x5, x5, x8 __LF \ + adcs x6, x6, x9 __LF \ + adc x10, x10, xzr __LF \ + ldp x7, x8, [p3] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x7, x8, [p3+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + adc x10, x10, xzr __LF \ + cmn x6, x6 __LF \ + bic x6, x6, #0x8000000000000000 __LF \ + adc x8, x10, x10 __LF \ + mov x9, #19 __LF \ + mul x7, x8, x9 __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [p0] __LF \ + stp x5, x6, [p0+16] + +// Multiplex: z := if NZ then x else y + +#define mux_4(p0,p1,p2) \ + ldp x0, x1, [p1] __LF \ + ldp x2, x3, [p2] __LF \ + csel x0, x0, x2, ne __LF \ + csel x1, x1, x3, ne __LF \ + stp x0, x1, [p0] __LF \ + ldp x0, x1, [p1+16] __LF \ + ldp x2, x3, [p2+16] __LF \ + csel x0, x0, x2, ne __LF \ + csel x1, x1, x3, ne __LF \ + stp x0, x1, [p0+16] + +// Paired multiplex: (w,z) := if NZ then (y,x) else (x,y) + +#define muxpair_4(p0,p1,p2,p3) \ + ldp x0, x1, [p2] __LF \ + ldp x2, x3, [p3] __LF \ + csel x4, x0, x2, eq __LF \ + csel x6, x0, x2, ne __LF \ + csel x5, x1, x3, eq __LF \ + csel x7, x1, x3, ne __LF \ + stp x4, x5, [p0] __LF \ + stp x6, x7, [p1] __LF \ + ldp x0, x1, [p2+16] __LF \ + ldp x2, x3, [p3+16] __LF \ + csel x4, x0, x2, eq __LF \ + csel x6, x0, x2, ne __LF \ + csel x5, x1, x3, eq __LF \ + csel x7, x1, x3, ne __LF \ + stp x4, x5, [p0+16] __LF \ + stp x6, x7, [p1+16] + +S2N_BN_SYMBOL(curve25519_ladderstep): + +// Save regs and make room for temporaries + + stp x19, x30, [sp, -16]! + stp x20, x21, [sp, -16]! 
+ sub sp, sp, #NSPACE + +// Move the input arguments to stable places + + mov rr, x0 + mov point, x1 + mov pp, x2 + mov b, x3 + +// sm = xm + zm; sn = xn + zn; dm = xm - zm; dn = xn - zn +// The adds don't need any normalization as they're fed to muls +// Just make sure the subs fit in 4 digits + + sub_4(dm, xm, zm) + add_4(sn, xn, zn) + sub_4(dn, xn, zn) + add_4(sm, xm, zm) + +// ADDING: dmsn = dm * sn; dnsm = sm * dn +// DOUBLING: mux d = xt - zt and s = xt + zt for appropriate choice of (xt,zt) + + mul_4(dmsn,dm,sn) + + cmp b, xzr + mux_4(d,dm,dn) + mux_4(s,sm,sn) + + mul_4(dnsm,sm,dn) + +// DOUBLING: d = (xt - zt)^2 normalized only to 4 digits + + sqr_4(d,d) + +// ADDING: dpro = (dmsn - dnsm)^2, spro = (dmsn + dnsm)^2 +// DOUBLING: s = (xt + zt)^2, normalized only to 4 digits + + sub_twice4(dpro,dmsn,dnsm) + sqr_4(s,s) + add_twice4(spro,dmsn,dnsm) + sqr_4(dpro,dpro) + +// DOUBLING: p = 4 * xt * zt = s - d + + sub_twice4(p,s,d) + +// ADDING: sumx = (dmsn + dnsm)^2 + + sqr_p25519(sumx,spro) + +// DOUBLING: e = 121666 * p + d + + mov x1, 0xdb42 + orr x1, x1, 0x10000 + cmadd_4(e,p,d) + +// DOUBLING: dubx = (xt + zt)^2 * (xt - zt)^2 = s * d + + mul_p25519(dubx,s,d) + +// ADDING: sumz = x * (dmsn - dnsm)^2 + + mul_p25519(sumz,dpro,x) + +// DOUBLING: dubz = (4 * xt * zt) * ((xt - zt)^2 + 121666 * (4 * xt * zt)) +// = p * (d + 121666 * p) + + mul_p25519(dubz,p,e) + +// Multiplex the outputs + + cmp b, xzr + muxpair_4(res0,res2,dubx,sumx) + muxpair_4(res1,res3,dubz,sumz) + +// Restore stack and registers + + add sp, sp, #NSPACE + ldp x20, x21, [sp], 16 + ldp x19, x30, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_ladderstep_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_ladderstep_alt.S new file mode 100644 index 00000000000..9aaaf502cc8 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_ladderstep_alt.S @@ -0,0 +1,686 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery ladder step on pairs of (X,Z)-projective curve25519 points +// +// extern void curve25519_ladderstep_alt +// (uint64_t rr[16],uint64_t point[8],uint64_t pp[16],uint64_t b) +// +// If point = (X,1) and pp = (n * (X,1),[n+1] * (X,1)) then the output +// rr = (n' * (X,1),[n'+1] * (X,1)) where n' = 2 * n + b, with input +// b assumed to be 0 or 1; in this setting, each pair (X,Z) is assumed to +// be a projective y-free representation of an affine curve25519 point +// (X/Z,y), with the initial "differential" point having Z = 1 and X its +// affine x coordinate. In other words, the ladderstep operation is a +// combination of doubling, differential addition and optional swapping. 
+// +// Standard ARM ABI: X0 = rr, X1 = point, X2 = pp, X3 = b +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(curve25519_ladderstep_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(curve25519_ladderstep_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define rr x17 +#define point x19 +#define pp x20 +#define b x21 + +// Pointer-offset pairs for inputs and outputs + +#define x point, #0 +#define z point, #NUMSIZE +#define xn pp, #0 +#define zn pp, #NUMSIZE +#define xm pp, #(2*NUMSIZE) +#define zm pp, #(3*NUMSIZE) +#define res0 rr, #0 +#define res1 rr, #NUMSIZE +#define res2 rr, #(2*NUMSIZE) +#define res3 rr, #(3*NUMSIZE) + +// Pointer-offset pairs for temporaries on stack + +#define sm sp, #(0*NUMSIZE) +#define sn sp, #(1*NUMSIZE) +#define dm sp, #(2*NUMSIZE) +#define dn sp, #(3*NUMSIZE) +#define dmsn sp, #(4*NUMSIZE) +#define dnsm sp, #(5*NUMSIZE) +#define s sp, #(6*NUMSIZE) +#define d sp, #(7*NUMSIZE) +#define p sp, #(8*NUMSIZE) + +// More, but aliases to above + +#define sumx sm +#define sumz sn +#define dubx dm +#define dubz dn +#define e dubz +#define spro dnsm +#define dpro sumz + +// Total size to reserve on the stack + +#define NSPACE (9*NUMSIZE) + +// Macros wrapping up the basic field operations bignum_mul_p25519_alt +// and bignum_sqr_p25519_alt, only trivially different from pure function +// call to those subroutines. + +#define mul_p25519(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ 
+ mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + orr x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + madd x11, x7, x8, x7 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x15, x15, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + and x15, x15, #0x7fffffffffffffff __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x15, [P0+16] + +#define sqr_p25519(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x7, x2, x4 __LF \ + umulh x6, x2, x4 __LF \ + adds x10, x10, x7 __LF \ + adcs x11, x11, x6 __LF \ + mul x7, x3, x4 __LF \ + umulh x6, x3, x4 __LF \ + adc x6, x6, xzr __LF \ + adds x11, x11, x7 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x6 __LF \ + mul x7, x3, x5 __LF \ + umulh x6, x3, x5 __LF \ + adc x6, x6, xzr __LF \ + adds x12, x12, x7 __LF \ + adcs x13, x13, x6 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x6, cs __LF \ + umulh x7, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x7 __LF \ + mul x7, x3, x3 __LF \ + adcs x10, x10, x7 __LF \ + umulh x7, x3, x3 __LF \ + adcs x11, x11, x7 __LF \ + mul x7, x4, x4 __LF \ + adcs x12, x12, x7 __LF \ + umulh x7, x4, x4 __LF \ + adcs x13, x13, x7 __LF \ + mul x7, x5, x5 __LF \ + adcs x14, x14, x7 __LF \ + umulh x7, x5, x5 __LF \ + adc x6, x6, x7 __LF \ + mov x3, #0x26 __LF \ + mul x7, x3, x12 __LF \ + umulh x4, x3, x12 __LF \ + adds x8, x8, x7 __LF \ + mul x7, x3, x13 __LF \ + umulh x13, x3, x13 __LF \ + adcs x9, x9, x7 __LF \ + mul x7, x3, x14 __LF \ + umulh x14, x3, x14 __LF \ + adcs x10, x10, x7 __LF \ + mul x7, x3, x6 __LF \ + umulh x6, x3, x6 __LF \ + adcs x11, x11, x7 __LF \ + cset x12, cs __LF \ + adds x11, x11, x14 __LF \ + adc x12, x12, x6 __LF \ + cmn x11, x11 __LF \ + orr x11, x11, #0x8000000000000000 __LF \ + adc x2, x12, x12 __LF \ + mov x3, #0x13 __LF \ + madd x7, x3, x2, x3 __LF \ + adds x8, x8, x7 __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x13 __LF \ + adcs x11, x11, xzr __LF \ + csel x3, x3, xzr, cc __LF \ + subs x8, x8, x3 __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbc x11, x11, xzr __LF \ + and x11, x11, #0x7fffffffffffffff __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] __LF \ + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. 
+ +#define mul_4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + bic x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + mul x11, x7, x8 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adc x15, x15, xzr __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x15, [P0+16] + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. 
+ +#define sqr_4(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x7, x2, x4 __LF \ + umulh x6, x2, x4 __LF \ + adds x10, x10, x7 __LF \ + adcs x11, x11, x6 __LF \ + mul x7, x3, x4 __LF \ + umulh x6, x3, x4 __LF \ + adc x6, x6, xzr __LF \ + adds x11, x11, x7 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x6 __LF \ + mul x7, x3, x5 __LF \ + umulh x6, x3, x5 __LF \ + adc x6, x6, xzr __LF \ + adds x12, x12, x7 __LF \ + adcs x13, x13, x6 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x6, cs __LF \ + umulh x7, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x7 __LF \ + mul x7, x3, x3 __LF \ + adcs x10, x10, x7 __LF \ + umulh x7, x3, x3 __LF \ + adcs x11, x11, x7 __LF \ + mul x7, x4, x4 __LF \ + adcs x12, x12, x7 __LF \ + umulh x7, x4, x4 __LF \ + adcs x13, x13, x7 __LF \ + mul x7, x5, x5 __LF \ + adcs x14, x14, x7 __LF \ + umulh x7, x5, x5 __LF \ + adc x6, x6, x7 __LF \ + mov x3, #0x26 __LF \ + mul x7, x3, x12 __LF \ + umulh x4, x3, x12 __LF \ + adds x8, x8, x7 __LF \ + mul x7, x3, x13 __LF \ + umulh x13, x3, x13 __LF \ + adcs x9, x9, x7 __LF \ + mul x7, x3, x14 __LF \ + umulh x14, x3, x14 __LF \ + adcs x10, x10, x7 __LF \ + mul x7, x3, x6 __LF \ + umulh x6, x3, x6 __LF \ + adcs x11, x11, x7 __LF \ + cset x12, cs __LF \ + adds x11, x11, x14 __LF \ + adc x12, x12, x6 __LF \ + cmn x11, x11 __LF \ + bic x11, x11, #0x8000000000000000 __LF \ + adc x2, x12, x12 __LF \ + mov x3, #0x13 __LF \ + mul x7, x3, x2 __LF \ + adds x8, x8, x7 __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x13 __LF \ + adc x11, x11, xzr __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Plain 4-digit add without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result + +#define add_4(p0,p1,p2) \ + ldp x0, x1, [p1] __LF \ + ldp x4, x5, [p2] __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + ldp x2, x3, [p1+16] __LF \ + ldp x6, x7, [p2+16] __LF \ + adcs x2, x2, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [p0] __LF \ + stp x2, x3, [p0+16] + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(p0,p1,p2) \ + ldp x5, x6, [p1] __LF \ + ldp x4, x3, [p2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [p1+16] __LF \ + ldp x4, x3, [p2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x3, #19 __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + mov x4, #0x8000000000000000 __LF \ + sbc x8, x8, x4 __LF \ + stp x5, x6, [p0] __LF \ + stp x7, x8, [p0+16] + +// Modular addition with double modulus 2 * p_25519 = 2^256 - 38. +// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. 
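Review note: the double-modulus trick used here (and by the identical macro in curve25519_ladderstep.S) is compact enough to state directly: a carry out of 2^256 is folded back in as +38, which preserves the value mod p_25519 since 2^256 = 38 (mod p), and the result stays within four digits provided the input sum is below 2^257 - 38 as the comment requires. A minimal Python sketch with an illustrative check:

import random

p = 2**255 - 19

def add_twice4_model(x, y):
    s = x + y                      # assumed, per the comment: s < 2^257 - 38
    carry = s >> 256
    return (s % 2**256) + 38 * carry

for _ in range(10000):
    x = random.randrange(2 * p)            # at least one input reduced mod 2*p
    y = random.randrange(2**256)
    r = add_twice4_model(x, y)
    assert r < 2**256 and r % p == (x + y) % p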
+ +#define add_twice4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(p0,p1,p2) \ + ldp x5, x6, [p1] __LF \ + ldp x4, x3, [p2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [p1+16] __LF \ + ldp x4, x3, [p2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [p0] __LF \ + stp x7, x8, [p0+16] + +// Combined z = c * x + y with reduction only < 2 * p_25519 +// where c is initially in the X1 register. It is assumed +// that 19 * (c * x + y) < 2^60 * 2^256 so we don't need a +// high mul in the final part. + +#define cmadd_4(p0,p2,p3) \ + ldp x7, x8, [p2] __LF \ + ldp x9, x10, [p2+16] __LF \ + mul x3, x1, x7 __LF \ + mul x4, x1, x8 __LF \ + mul x5, x1, x9 __LF \ + mul x6, x1, x10 __LF \ + umulh x7, x1, x7 __LF \ + umulh x8, x1, x8 __LF \ + umulh x9, x1, x9 __LF \ + umulh x10, x1, x10 __LF \ + adds x4, x4, x7 __LF \ + adcs x5, x5, x8 __LF \ + adcs x6, x6, x9 __LF \ + adc x10, x10, xzr __LF \ + ldp x7, x8, [p3] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x7, x8, [p3+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + adc x10, x10, xzr __LF \ + cmn x6, x6 __LF \ + bic x6, x6, #0x8000000000000000 __LF \ + adc x8, x10, x10 __LF \ + mov x9, #19 __LF \ + mul x7, x8, x9 __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [p0] __LF \ + stp x5, x6, [p0+16] + +// Multiplex: z := if NZ then x else y + +#define mux_4(p0,p1,p2) \ + ldp x0, x1, [p1] __LF \ + ldp x2, x3, [p2] __LF \ + csel x0, x0, x2, ne __LF \ + csel x1, x1, x3, ne __LF \ + stp x0, x1, [p0] __LF \ + ldp x0, x1, [p1+16] __LF \ + ldp x2, x3, [p2+16] __LF \ + csel x0, x0, x2, ne __LF \ + csel x1, x1, x3, ne __LF \ + stp x0, x1, [p0+16] + +// Paired multiplex: (w,z) := if NZ then (y,x) else (x,y) + +#define muxpair_4(p0,p1,p2,p3) \ + ldp x0, x1, [p2] __LF \ + ldp x2, x3, [p3] __LF \ + csel x4, x0, x2, eq __LF \ + csel x6, x0, x2, ne __LF \ + csel x5, x1, x3, eq __LF \ + csel x7, x1, x3, ne __LF \ + stp x4, x5, [p0] __LF \ + stp x6, x7, [p1] __LF \ + ldp x0, x1, [p2+16] __LF \ + ldp x2, x3, [p3+16] __LF \ + csel x4, x0, x2, eq __LF \ + csel x6, x0, x2, ne __LF \ + csel x5, x1, x3, eq __LF \ + csel x7, x1, x3, ne __LF \ + stp x4, x5, [p0+16] __LF \ + stp x6, x7, [p1+16] + +S2N_BN_SYMBOL(curve25519_ladderstep_alt): + +// Save regs and make room for temporaries + + stp x19, x30, [sp, -16]! + stp x20, x21, [sp, -16]! 
+ sub sp, sp, #NSPACE + +// Move the input arguments to stable places + + mov rr, x0 + mov point, x1 + mov pp, x2 + mov b, x3 + +// sm = xm + zm; sn = xn + zn; dm = xm - zm; dn = xn - zn +// The adds don't need any normalization as they're fed to muls +// Just make sure the subs fit in 4 digits + + sub_4(dm, xm, zm) + add_4(sn, xn, zn) + sub_4(dn, xn, zn) + add_4(sm, xm, zm) + +// ADDING: dmsn = dm * sn; dnsm = sm * dn +// DOUBLING: mux d = xt - zt and s = xt + zt for appropriate choice of (xt,zt) + + mul_4(dmsn,dm,sn) + + cmp b, xzr + mux_4(d,dm,dn) + mux_4(s,sm,sn) + + mul_4(dnsm,sm,dn) + +// DOUBLING: d = (xt - zt)^2 normalized only to 4 digits + + sqr_4(d,d) + +// ADDING: dpro = (dmsn - dnsm)^2, spro = (dmsn + dnsm)^2 +// DOUBLING: s = (xt + zt)^2, normalized only to 4 digits + + sub_twice4(dpro,dmsn,dnsm) + sqr_4(s,s) + add_twice4(spro,dmsn,dnsm) + sqr_4(dpro,dpro) + +// DOUBLING: p = 4 * xt * zt = s - d + + sub_twice4(p,s,d) + +// ADDING: sumx = (dmsn + dnsm)^2 + + sqr_p25519(sumx,spro) + +// DOUBLING: e = 121666 * p + d + + mov x1, 0xdb42 + orr x1, x1, 0x10000 + cmadd_4(e,p,d) + +// DOUBLING: dubx = (xt + zt)^2 * (xt - zt)^2 = s * d + + mul_p25519(dubx,s,d) + +// ADDING: sumz = x * (dmsn - dnsm)^2 + + mul_p25519(sumz,dpro,x) + +// DOUBLING: dubz = (4 * xt * zt) * ((xt - zt)^2 + 121666 * (4 * xt * zt)) +// = p * (d + 121666 * p) + + mul_p25519(dubz,p,e) + +// Multiplex the outputs + + cmp b, xzr + muxpair_4(res0,res2,dubx,sumx) + muxpair_4(res1,res3,dubz,sumz) + +// Restore stack and registers + + add sp, sp, #NSPACE + ldp x20, x21, [sp], 16 + ldp x19, x30, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_pxscalarmul.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_pxscalarmul.S new file mode 100644 index 00000000000..b28051467be --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_pxscalarmul.S @@ -0,0 +1,995 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Projective scalar multiplication, x coordinate only, for curve25519 +// Inputs scalar[4], point[4]; output res[8] +// +// extern void curve25519_pxscalarmul +// (uint64_t res[static 8],uint64_t scalar[static 4],uint64_t point[static 4]) +// +// Given the X coordinate of an input point = (X,Y) on curve25519, which +// could also be part of a projective representation (X,Y,1) of the same +// point, returns a projective representation (X,Z) = scalar * point, where +// scalar is a 256-bit number. The corresponding affine form is (X/Z,Y'), +// X/Z meaning division modulo 2^255-19, and Y' not being computed by +// this function (nor is any Y coordinate of the input point used). 
+// +// Standard ARM ABI: X0 = res, X1 = scalar, X2 = point +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(curve25519_pxscalarmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(curve25519_pxscalarmul) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence +// and additional registers for loop counter and swap flag + +#define res x17 +#define point x19 +#define scalar x20 +#define i x21 +#define swap x22 + +// Pointers to input x coord (we don't use y or z) and output coords. + +#define x point, #0 +#define resx res, #0 +#define resz res, #NUMSIZE + +// Pointer-offset pairs for temporaries on stack with some aliasing. + +#define zm sp, #(0*NUMSIZE) +#define sm sp, #(0*NUMSIZE) +#define dpro sp, #(0*NUMSIZE) + +#define sn sp, #(1*NUMSIZE) + +#define dm sp, #(2*NUMSIZE) + +#define zn sp, #(3*NUMSIZE) +#define dn sp, #(3*NUMSIZE) +#define e sp, #(3*NUMSIZE) + +#define dmsn sp, #(4*NUMSIZE) +#define p sp, #(4*NUMSIZE) + +#define xm sp, #(5*NUMSIZE) +#define dnsm sp, #(5*NUMSIZE) +#define spro sp, #(5*NUMSIZE) + +#define xn sp, #(6*NUMSIZE) +#define s sp, #(6*NUMSIZE) + +#define d sp, #(7*NUMSIZE) + +// Total size to reserve on the stack + +#define NSPACE (8*NUMSIZE) + +// Macros wrapping up the basic field operations bignum_mul_p25519 +// and bignum_sqr_p25519, only trivially different from pure function +// call to those subroutines. + +#define mul_p25519(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + 
csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umaddl x5, w5, w0, x5 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + mov x3, #0x13 __LF \ + tst x10, #0x8000000000000000 __LF \ + csel x3, x3, xzr, pl __LF \ + subs x7, x7, x3 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + and x10, x10, #0x7fffffffffffffff __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] + +#define sqr_p25519(P0,P1) \ + ldp x10, x11, [P1] __LF \ + ldp x12, x13, [P1+16] __LF \ + umull x2, w10, w10 __LF \ + lsr x14, x10, #32 __LF \ + umull x3, w14, w14 __LF \ + umull x14, w10, w14 __LF \ + adds x2, x2, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x3, x3, x14 __LF \ + umull x4, w11, w11 __LF \ + lsr x14, x11, #32 __LF \ + umull x5, w14, w14 __LF \ + umull x14, w11, w14 __LF \ + mul x15, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x4, x4, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x5, x5, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x5, x5, xzr __LF \ + adds x3, x3, x15 __LF \ + adcs x4, x4, x16 __LF \ + adc x5, x5, xzr __LF \ + umull x6, w12, w12 __LF \ + lsr x14, x12, #32 
__LF \ + umull x7, w14, w14 __LF \ + umull x14, w12, w14 __LF \ + adds x6, x6, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x7, x7, x14 __LF \ + umull x8, w13, w13 __LF \ + lsr x14, x13, #32 __LF \ + umull x9, w14, w14 __LF \ + umull x14, w13, w14 __LF \ + mul x15, x12, x13 __LF \ + umulh x16, x12, x13 __LF \ + adds x8, x8, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x9, x9, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x9, x9, xzr __LF \ + adds x7, x7, x15 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, xzr __LF \ + subs x10, x10, x12 __LF \ + sbcs x11, x11, x13 __LF \ + csetm x16, cc __LF \ + eor x10, x10, x16 __LF \ + subs x10, x10, x16 __LF \ + eor x11, x11, x16 __LF \ + sbc x11, x11, x16 __LF \ + adds x6, x6, x4 __LF \ + adcs x7, x7, x5 __LF \ + adcs x8, x8, xzr __LF \ + adc x9, x9, xzr __LF \ + umull x12, w10, w10 __LF \ + lsr x5, x10, #32 __LF \ + umull x13, w5, w5 __LF \ + umull x5, w10, w5 __LF \ + adds x12, x12, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x13, x13, x5 __LF \ + umull x15, w11, w11 __LF \ + lsr x5, x11, #32 __LF \ + umull x14, w5, w5 __LF \ + umull x5, w11, w5 __LF \ + mul x4, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x15, x15, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x14, x14, x5 __LF \ + adds x4, x4, x4 __LF \ + adcs x16, x16, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x13, x13, x4 __LF \ + adcs x15, x15, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x4, x2, x6 __LF \ + adcs x5, x3, x7 __LF \ + adcs x6, x6, x8 __LF \ + adcs x7, x7, x9 __LF \ + csetm x16, cc __LF \ + subs x4, x4, x12 __LF \ + sbcs x5, x5, x13 __LF \ + sbcs x6, x6, x15 __LF \ + sbcs x7, x7, x14 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, x16 __LF \ + mov x10, #0x26 __LF \ + umull x12, w6, w10 __LF \ + add x12, x12, w2, uxtw __LF \ + lsr x2, x2, #32 __LF \ + lsr x6, x6, #32 __LF \ + umaddl x6, w6, w10, x2 __LF \ + mov x2, x12 __LF \ + umull x12, w7, w10 __LF \ + add x12, x12, w3, uxtw __LF \ + lsr x3, x3, #32 __LF \ + lsr x7, x7, #32 __LF \ + umaddl x7, w7, w10, x3 __LF \ + mov x3, x12 __LF \ + umull x12, w8, w10 __LF \ + add x12, x12, w4, uxtw __LF \ + lsr x4, x4, #32 __LF \ + lsr x8, x8, #32 __LF \ + umaddl x8, w8, w10, x4 __LF \ + mov x4, x12 __LF \ + umull x12, w9, w10 __LF \ + add x12, x12, w5, uxtw __LF \ + lsr x5, x5, #32 __LF \ + lsr x9, x9, #32 __LF \ + umaddl x9, w9, w10, x5 __LF \ + mov x5, x12 __LF \ + lsr x13, x9, #31 __LF \ + mov x11, #0x13 __LF \ + umaddl x11, w11, w13, x11 __LF \ + add x2, x2, x11 __LF \ + adds x2, x2, x6, lsl #32 __LF \ + extr x10, x7, x6, #32 __LF \ + adcs x3, x3, x10 __LF \ + extr x10, x8, x7, #32 __LF \ + adcs x4, x4, x10 __LF \ + extr x10, x9, x8, #32 __LF \ + lsl x11, x13, #63 __LF \ + eor x5, x5, x11 __LF \ + adc x5, x5, x10 __LF \ + mov x10, #0x13 __LF \ + tst x5, #0x8000000000000000 __LF \ + csel x10, x10, xzr, pl __LF \ + subs x2, x2, x10 __LF \ + sbcs x3, x3, xzr __LF \ + sbcs x4, x4, xzr __LF \ + sbc x5, x5, xzr __LF \ + and x5, x5, #0x7fffffffffffffff __LF \ + stp x2, x3, [P0] __LF \ + stp x4, x5, [P0+16] + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. 
+ +#define mul_4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, 
w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umull x5, w5, w0 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. + +#define sqr_4(P0,P1) \ + ldp x10, x11, [P1] __LF \ + ldp x12, x13, [P1+16] __LF \ + umull x2, w10, w10 __LF \ + lsr x14, x10, #32 __LF \ + umull x3, w14, w14 __LF \ + umull x14, w10, w14 __LF \ + adds x2, x2, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x3, x3, x14 __LF \ + umull x4, w11, w11 __LF \ + lsr x14, x11, #32 __LF \ + umull x5, w14, w14 __LF \ + umull x14, w11, w14 __LF \ + mul x15, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x4, x4, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x5, x5, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x5, x5, xzr __LF \ + adds x3, x3, x15 __LF \ + adcs x4, x4, x16 __LF \ + adc x5, x5, xzr __LF \ + umull x6, w12, w12 __LF \ + lsr x14, x12, #32 __LF \ + umull x7, w14, w14 __LF \ + umull x14, w12, w14 __LF \ + adds x6, x6, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x7, x7, x14 __LF \ + umull x8, w13, w13 __LF \ + lsr x14, x13, #32 __LF \ + umull x9, w14, w14 __LF \ + umull x14, w13, w14 __LF \ + mul x15, x12, x13 __LF \ + umulh x16, x12, x13 __LF \ + adds x8, x8, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x9, x9, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x9, x9, xzr __LF \ + adds x7, x7, x15 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, xzr __LF \ + subs x10, x10, x12 __LF \ + sbcs x11, x11, x13 __LF \ + csetm x16, cc __LF \ + eor x10, x10, x16 __LF \ + subs x10, x10, x16 __LF \ + eor x11, x11, x16 __LF \ + sbc x11, x11, x16 __LF \ + adds x6, x6, x4 __LF \ + adcs x7, x7, x5 __LF \ + adcs x8, x8, xzr __LF \ + adc x9, x9, xzr __LF \ + umull x12, w10, w10 __LF \ + lsr x5, x10, #32 __LF \ + umull x13, w5, w5 __LF \ + umull x5, w10, w5 __LF \ + adds x12, x12, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x13, x13, x5 __LF \ + umull x15, w11, w11 __LF \ + lsr x5, x11, #32 __LF \ + umull x14, w5, w5 __LF \ + umull x5, w11, w5 __LF \ + mul x4, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x15, x15, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x14, x14, x5 __LF \ + adds x4, x4, x4 __LF \ + adcs x16, x16, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x13, x13, x4 __LF \ + adcs x15, x15, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x4, x2, x6 __LF \ + adcs x5, x3, x7 __LF \ + adcs x6, x6, x8 __LF \ + adcs x7, x7, x9 __LF \ + csetm x16, cc __LF \ + subs x4, x4, x12 __LF \ + sbcs x5, x5, x13 
__LF \ + sbcs x6, x6, x15 __LF \ + sbcs x7, x7, x14 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, x16 __LF \ + mov x10, #0x26 __LF \ + umull x12, w6, w10 __LF \ + add x12, x12, w2, uxtw __LF \ + lsr x2, x2, #32 __LF \ + lsr x6, x6, #32 __LF \ + umaddl x6, w6, w10, x2 __LF \ + mov x2, x12 __LF \ + umull x12, w7, w10 __LF \ + add x12, x12, w3, uxtw __LF \ + lsr x3, x3, #32 __LF \ + lsr x7, x7, #32 __LF \ + umaddl x7, w7, w10, x3 __LF \ + mov x3, x12 __LF \ + umull x12, w8, w10 __LF \ + add x12, x12, w4, uxtw __LF \ + lsr x4, x4, #32 __LF \ + lsr x8, x8, #32 __LF \ + umaddl x8, w8, w10, x4 __LF \ + mov x4, x12 __LF \ + umull x12, w9, w10 __LF \ + add x12, x12, w5, uxtw __LF \ + lsr x5, x5, #32 __LF \ + lsr x9, x9, #32 __LF \ + umaddl x9, w9, w10, x5 __LF \ + mov x5, x12 __LF \ + lsr x13, x9, #31 __LF \ + mov x11, #0x13 __LF \ + umull x11, w11, w13 __LF \ + add x2, x2, x11 __LF \ + adds x2, x2, x6, lsl #32 __LF \ + extr x10, x7, x6, #32 __LF \ + adcs x3, x3, x10 __LF \ + extr x10, x8, x7, #32 __LF \ + adcs x4, x4, x10 __LF \ + extr x10, x9, x8, #32 __LF \ + lsl x11, x13, #63 __LF \ + eor x5, x5, x11 __LF \ + adc x5, x5, x10 __LF \ + stp x2, x3, [P0] __LF \ + stp x4, x5, [P0+16] + +// Plain 4-digit add without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result + +#define add_4(p0,p1,p2) \ + ldp x0, x1, [p1] __LF \ + ldp x4, x5, [p2] __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + ldp x2, x3, [p1+16] __LF \ + ldp x6, x7, [p2+16] __LF \ + adcs x2, x2, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [p0] __LF \ + stp x2, x3, [p0+16] + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(p0,p1,p2) \ + ldp x5, x6, [p1] __LF \ + ldp x4, x3, [p2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [p1+16] __LF \ + ldp x4, x3, [p2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x3, #19 __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + mov x4, #0x8000000000000000 __LF \ + sbc x8, x8, x4 __LF \ + stp x5, x6, [p0] __LF \ + stp x7, x8, [p0+16] + +// Modular addition with double modulus 2 * p_25519 = 2^256 - 38. +// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. 
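As a companion to the comment above, the following C sketch (illustrative only; it assumes a GCC/Clang unsigned __int128 type) shows the operation the add_twice4 macro below performs: a plain 4-limb addition whose carry out of 2^256 is folded back in as 38, which is sound only under the stated precondition on the inputs.

    #include <stdint.h>

    /* Add two 4-limb values and fold the carry out of 2^256 back in as 38,
       since 2^256 == 38 (mod 2 * p_25519 = 2^256 - 38).  As stated above,
       this is only sound when the true sum is below 2^257 - 38, e.g. when at
       least one input is already reduced w.r.t. the double modulus. */
    static void add_twice4_sketch(uint64_t z[4], const uint64_t x[4], const uint64_t y[4])
    {
      unsigned __int128 acc = 0;        /* assumes a GCC/Clang 128-bit type */
      uint64_t t[4];
      for (int i = 0; i < 4; i++) {
        acc += (unsigned __int128)x[i] + y[i];
        t[i] = (uint64_t)acc;
        acc >>= 64;
      }
      acc = acc ? 38 : 0;               /* carry out of 2^256 is worth 38 */
      for (int i = 0; i < 4; i++) {
        acc += t[i];
        z[i] = (uint64_t)acc;
        acc >>= 64;
      }
      /* Under the precondition above, no carry can remain at this point. */
    }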
+ +#define add_twice4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(p0,p1,p2) \ + ldp x5, x6, [p1] __LF \ + ldp x4, x3, [p2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [p1+16] __LF \ + ldp x4, x3, [p2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [p0] __LF \ + stp x7, x8, [p0+16] + +// Combined z = c * x + y with reduction only < 2 * p_25519 +// where c is initially in the X1 register. It is assumed +// that 19 * (c * x + y) < 2^60 * 2^256 so we don't need a +// high mul in the final part. + +#define cmadd_4(p0,p2,p3) \ + ldp x7, x8, [p2] __LF \ + ldp x9, x10, [p2+16] __LF \ + mul x3, x1, x7 __LF \ + mul x4, x1, x8 __LF \ + mul x5, x1, x9 __LF \ + mul x6, x1, x10 __LF \ + umulh x7, x1, x7 __LF \ + umulh x8, x1, x8 __LF \ + umulh x9, x1, x9 __LF \ + umulh x10, x1, x10 __LF \ + adds x4, x4, x7 __LF \ + adcs x5, x5, x8 __LF \ + adcs x6, x6, x9 __LF \ + adc x10, x10, xzr __LF \ + ldp x7, x8, [p3] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x7, x8, [p3+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + adc x10, x10, xzr __LF \ + cmn x6, x6 __LF \ + bic x6, x6, #0x8000000000000000 __LF \ + adc x8, x10, x10 __LF \ + mov x9, #19 __LF \ + mul x7, x8, x9 __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [p0] __LF \ + stp x5, x6, [p0+16] + +// Multiplex: z := if NZ then x else y + +#define mux_4(p0,p1,p2) \ + ldp x0, x1, [p1] __LF \ + ldp x2, x3, [p2] __LF \ + csel x0, x0, x2, ne __LF \ + csel x1, x1, x3, ne __LF \ + stp x0, x1, [p0] __LF \ + ldp x0, x1, [p1+16] __LF \ + ldp x2, x3, [p2+16] __LF \ + csel x0, x0, x2, ne __LF \ + csel x1, x1, x3, ne __LF \ + stp x0, x1, [p0+16] + +S2N_BN_SYMBOL(curve25519_pxscalarmul): + +// Save regs and make room for temporaries + + stp x19, x22, [sp, -16]! + stp x20, x21, [sp, -16]! 
+ sub sp, sp, #NSPACE + +// Move the input arguments to stable places + + mov res, x0 + mov scalar, x1 + mov point, x2 + +// Initialize (xn,zn) = (1,0) and (xm,zm) = (x,1) with swap = 0 + + mov x2, #1 + stp x2, xzr, [xn] + stp xzr, xzr, [xn+16] + stp xzr, xzr, [zn] + stp xzr, xzr, [zn+16] + ldp x0, x1, [x] + stp x0, x1, [xm] + ldp x0, x1, [x+16] + stp x0, x1, [xm+16] + ldp x0, x1, [x+32] + stp x2, xzr, [zm] + stp xzr, xzr, [zm+16] + mov swap, xzr + +// The outer loop from i = 255, ..., i = 0 (inclusive) + + mov i, #255 + +curve25519_pxscalarmul_loop: + +// sm = xm + zm; sn = xn + zn; dm = xm - zm; dn = xn - zn +// The adds don't need any normalization as they're fed to muls +// Just make sure the subs fit in 4 digits + + sub_4(dm, xm, zm) + add_4(sn, xn, zn) + sub_4(dn, xn, zn) + add_4(sm, xm, zm) + +// ADDING: dmsn = dm * sn; dnsm = sm * dn +// DOUBLING: mux d = xt - zt and s = xt + zt for appropriate choice of (xt,zt) + + mul_4(dmsn,sn,dm) + + lsr x0, i, #6 + ldr x2, [scalar, x0, lsl #3] + lsr x2, x2, i + and x2, x2, #1 + + cmp swap, x2 + mov swap, x2 + + mux_4(d,dm,dn) + mux_4(s,sm,sn) + + mul_4(dnsm,sm,dn) + +// DOUBLING: d = (xt - zt)^2 normalized only to 4 digits + + sqr_4(d,d) + +// ADDING: dpro = (dmsn - dnsm)^2, spro = (dmsn + dnsm)^2 +// DOUBLING: s = (xt + zt)^2, normalized only to 4 digits + + sub_twice4(dpro,dmsn,dnsm) + sqr_4(s,s) + add_twice4(spro,dmsn,dnsm) + sqr_4(dpro,dpro) + +// DOUBLING: p = 4 * xt * zt = s - d + + sub_twice4(p,s,d) + +// ADDING: xm' = (dmsn + dnsm)^2 + + sqr_p25519(xm,spro) + +// DOUBLING: e = 121666 * p + d + + mov x1, 0xdb42 + orr x1, x1, 0x10000 + cmadd_4(e,p,d) + +// DOUBLING: xn' = (xt + zt)^2 * (xt - zt)^2 = s * d + + mul_p25519(xn,s,d) + +// ADDING: zm' = x * (dmsn - dnsm)^2 + + mul_p25519(zm,dpro,x) + +// DOUBLING: zn' = (4 * xt * zt) * ((xt - zt)^2 + 121666 * (4 * xt * zt)) +// = p * (d + 121666 * p) + + mul_p25519(zn,p,e) + +// Loop down as far as 0 (inclusive) + + subs i, i, #1 + bcs curve25519_pxscalarmul_loop + +// The main loop does not handle the special input of the 2-torsion +// point = (0,0). In that case we may get a spurious (0,0) as output +// when we want (0,1) [for odd scalar] or (1,0) [for even scalar]. +// Test if x = 0 (this is equivalent for curve25519 to y = 0) and if +// so, patch zm = 1 [for odd multiple], xn = 1 [for even multiple]. + + ldp x0, x1, [point] + orr x0, x0, x1 + ldp x2, x3, [point, #16] + orr x2, x2, x3 + orr x0, x0, x2 + cmp x0, xzr + cset x0, eq + ldr x1, [zm] + orr x1, x1, x0 + str x1, [zm] + ldr x2, [xn] + orr x2, x2, x0 + str x2, [xn] + +// Multiplex into the final outputs + + cmp swap, xzr + + mux_4(resx,xm,xn) + mux_4(resz,zm,zn) + +// Restore stack and registers + + add sp, sp, #NSPACE + ldp x20, x21, [sp], 16 + ldp x19, x22, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_pxscalarmul_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_pxscalarmul_alt.S new file mode 100644 index 00000000000..ef62e32cf1e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_pxscalarmul_alt.S @@ -0,0 +1,719 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Projective scalar multiplication, x coordinate only, for curve25519 +// Inputs scalar[4], point[4]; output res[8] +// +// extern void curve25519_pxscalarmul_alt +// (uint64_t res[static 8],uint64_t scalar[static 4],uint64_t point[static 4]) +// +// Given the X coordinate of an input point = (X,Y) on curve25519, which +// could also be part of a projective representation (X,Y,1) of the same +// point, returns a projective representation (X,Z) = scalar * point, where +// scalar is a 256-bit number. The corresponding affine form is (X/Z,Y'), +// X/Z meaning division modulo 2^255-19, and Y' not being computed by +// this function (nor is any Y coordinate of the input point used). +// +// Standard ARM ABI: X0 = res, X1 = scalar, X2 = point +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(curve25519_pxscalarmul_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(curve25519_pxscalarmul_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence +// and additional registers for loop counter and swap flag + +#define res x17 +#define point x19 +#define scalar x20 +#define i x21 +#define swap x22 + +// Pointers to input x coord (we don't use y or z) and output coords. + +#define x point, #0 +#define resx res, #0 +#define resz res, #NUMSIZE + +// Pointer-offset pairs for temporaries on stack with some aliasing. + +#define zm sp, #(0*NUMSIZE) +#define sm sp, #(0*NUMSIZE) +#define dpro sp, #(0*NUMSIZE) + +#define sn sp, #(1*NUMSIZE) + +#define dm sp, #(2*NUMSIZE) + +#define zn sp, #(3*NUMSIZE) +#define dn sp, #(3*NUMSIZE) +#define e sp, #(3*NUMSIZE) + +#define dmsn sp, #(4*NUMSIZE) +#define p sp, #(4*NUMSIZE) + +#define xm sp, #(5*NUMSIZE) +#define dnsm sp, #(5*NUMSIZE) +#define spro sp, #(5*NUMSIZE) + +#define xn sp, #(6*NUMSIZE) +#define s sp, #(6*NUMSIZE) + +#define d sp, #(7*NUMSIZE) + +// Total size to reserve on the stack + +#define NSPACE (8*NUMSIZE) + +// Macros wrapping up the basic field operations bignum_mul_p25519_alt +// and bignum_sqr_p25519_alt, only trivially different from pure function +// call to those subroutines. 
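Given the calling contract documented in the header above, a hypothetical caller could look like the C sketch below. The scalar value is a placeholder, the prototype is the one quoted in the header comment, and the conversion to affine coordinates is deliberately left abstract.

    #include <stdint.h>

    extern void curve25519_pxscalarmul_alt(uint64_t res[static 8],
                                           uint64_t scalar[static 4],
                                           uint64_t point[static 4]);

    void pxscalarmul_alt_example(void)
    {
      /* Little-endian 4x64-bit operands; the scalar below is a placeholder
         and the point is the x coordinate 9 of the curve25519 base point. */
      uint64_t scalar[4] = {0x0123456789abcdefULL, 0, 0, 0};
      uint64_t point[4]  = {9, 0, 0, 0};
      uint64_t res[8];

      curve25519_pxscalarmul_alt(res, scalar, point);

      /* res[0..3] is the projective X and res[4..7] the projective Z of
         scalar * point; the affine x coordinate would be X * Z^(-1) modulo
         2^255 - 19, a field inversion this sketch deliberately leaves out. */
    }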
+ +#define mul_p25519(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + orr x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + madd x11, x7, x8, x7 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x15, x15, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + and x15, x15, #0x7fffffffffffffff __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x15, [P0+16] + +#define sqr_p25519(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x7, x2, x4 __LF \ + umulh x6, x2, x4 __LF \ + adds x10, x10, x7 __LF \ + adcs x11, x11, x6 __LF \ + mul x7, x3, x4 __LF \ + umulh x6, x3, x4 __LF \ + adc x6, x6, xzr __LF \ + adds x11, x11, x7 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x6 __LF \ + mul x7, x3, x5 __LF \ + umulh x6, x3, x5 __LF \ + adc x6, x6, xzr __LF \ + adds x12, x12, x7 __LF \ + adcs x13, x13, x6 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x6, cs __LF \ + umulh x7, 
x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x7 __LF \ + mul x7, x3, x3 __LF \ + adcs x10, x10, x7 __LF \ + umulh x7, x3, x3 __LF \ + adcs x11, x11, x7 __LF \ + mul x7, x4, x4 __LF \ + adcs x12, x12, x7 __LF \ + umulh x7, x4, x4 __LF \ + adcs x13, x13, x7 __LF \ + mul x7, x5, x5 __LF \ + adcs x14, x14, x7 __LF \ + umulh x7, x5, x5 __LF \ + adc x6, x6, x7 __LF \ + mov x3, #0x26 __LF \ + mul x7, x3, x12 __LF \ + umulh x4, x3, x12 __LF \ + adds x8, x8, x7 __LF \ + mul x7, x3, x13 __LF \ + umulh x13, x3, x13 __LF \ + adcs x9, x9, x7 __LF \ + mul x7, x3, x14 __LF \ + umulh x14, x3, x14 __LF \ + adcs x10, x10, x7 __LF \ + mul x7, x3, x6 __LF \ + umulh x6, x3, x6 __LF \ + adcs x11, x11, x7 __LF \ + cset x12, cs __LF \ + adds x11, x11, x14 __LF \ + adc x12, x12, x6 __LF \ + cmn x11, x11 __LF \ + orr x11, x11, #0x8000000000000000 __LF \ + adc x2, x12, x12 __LF \ + mov x3, #0x13 __LF \ + madd x7, x3, x2, x3 __LF \ + adds x8, x8, x7 __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x13 __LF \ + adcs x11, x11, xzr __LF \ + csel x3, x3, xzr, cc __LF \ + subs x8, x8, x3 __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbc x11, x11, xzr __LF \ + and x11, x11, #0x7fffffffffffffff __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] __LF \ + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. + +#define mul_4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + 
adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + bic x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + mul x11, x7, x8 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adc x15, x15, xzr __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x15, [P0+16] + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. + +#define sqr_4(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x7, x2, x4 __LF \ + umulh x6, x2, x4 __LF \ + adds x10, x10, x7 __LF \ + adcs x11, x11, x6 __LF \ + mul x7, x3, x4 __LF \ + umulh x6, x3, x4 __LF \ + adc x6, x6, xzr __LF \ + adds x11, x11, x7 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x6 __LF \ + mul x7, x3, x5 __LF \ + umulh x6, x3, x5 __LF \ + adc x6, x6, xzr __LF \ + adds x12, x12, x7 __LF \ + adcs x13, x13, x6 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x6, cs __LF \ + umulh x7, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x7 __LF \ + mul x7, x3, x3 __LF \ + adcs x10, x10, x7 __LF \ + umulh x7, x3, x3 __LF \ + adcs x11, x11, x7 __LF \ + mul x7, x4, x4 __LF \ + adcs x12, x12, x7 __LF \ + umulh x7, x4, x4 __LF \ + adcs x13, x13, x7 __LF \ + mul x7, x5, x5 __LF \ + adcs x14, x14, x7 __LF \ + umulh x7, x5, x5 __LF \ + adc x6, x6, x7 __LF \ + mov x3, #0x26 __LF \ + mul x7, x3, x12 __LF \ + umulh x4, x3, x12 __LF \ + adds x8, x8, x7 __LF \ + mul x7, x3, x13 __LF \ + umulh x13, x3, x13 __LF \ + adcs x9, x9, x7 __LF \ + mul x7, x3, x14 __LF \ + umulh x14, x3, x14 __LF \ + adcs x10, x10, x7 __LF \ + mul x7, x3, x6 __LF \ + umulh x6, x3, x6 __LF \ + adcs x11, x11, x7 __LF \ + cset x12, cs __LF \ + adds x11, x11, x14 __LF \ + adc x12, x12, x6 __LF \ + cmn x11, x11 __LF \ + bic x11, x11, #0x8000000000000000 __LF \ + adc x2, x12, x12 __LF \ + mov x3, #0x13 __LF \ + mul x7, x3, x2 __LF \ + adds x8, x8, x7 __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x13 __LF \ + adc x11, x11, xzr __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Plain 4-digit add without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result + +#define add_4(p0,p1,p2) \ + ldp x0, x1, [p1] __LF \ + ldp x4, x5, [p2] __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + ldp x2, x3, [p1+16] __LF \ + ldp x6, x7, [p2+16] __LF \ + adcs x2, x2, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [p0] __LF \ + stp x2, x3, [p0+16] + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. 
It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(p0,p1,p2) \ + ldp x5, x6, [p1] __LF \ + ldp x4, x3, [p2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [p1+16] __LF \ + ldp x4, x3, [p2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x3, #19 __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + mov x4, #0x8000000000000000 __LF \ + sbc x8, x8, x4 __LF \ + stp x5, x6, [p0] __LF \ + stp x7, x8, [p0+16] + +// Modular addition with double modulus 2 * p_25519 = 2^256 - 38. +// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. + +#define add_twice4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(p0,p1,p2) \ + ldp x5, x6, [p1] __LF \ + ldp x4, x3, [p2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [p1+16] __LF \ + ldp x4, x3, [p2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [p0] __LF \ + stp x7, x8, [p0+16] + +// Combined z = c * x + y with reduction only < 2 * p_25519 +// where c is initially in the X1 register. It is assumed +// that 19 * (c * x + y) < 2^60 * 2^256 so we don't need a +// high mul in the final part. + +#define cmadd_4(p0,p2,p3) \ + ldp x7, x8, [p2] __LF \ + ldp x9, x10, [p2+16] __LF \ + mul x3, x1, x7 __LF \ + mul x4, x1, x8 __LF \ + mul x5, x1, x9 __LF \ + mul x6, x1, x10 __LF \ + umulh x7, x1, x7 __LF \ + umulh x8, x1, x8 __LF \ + umulh x9, x1, x9 __LF \ + umulh x10, x1, x10 __LF \ + adds x4, x4, x7 __LF \ + adcs x5, x5, x8 __LF \ + adcs x6, x6, x9 __LF \ + adc x10, x10, xzr __LF \ + ldp x7, x8, [p3] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x7, x8, [p3+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + adc x10, x10, xzr __LF \ + cmn x6, x6 __LF \ + bic x6, x6, #0x8000000000000000 __LF \ + adc x8, x10, x10 __LF \ + mov x9, #19 __LF \ + mul x7, x8, x9 __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [p0] __LF \ + stp x5, x6, [p0+16] + +// Multiplex: z := if NZ then x else y + +#define mux_4(p0,p1,p2) \ + ldp x0, x1, [p1] __LF \ + ldp x2, x3, [p2] __LF \ + csel x0, x0, x2, ne __LF \ + csel x1, x1, x3, ne __LF \ + stp x0, x1, [p0] __LF \ + ldp x0, x1, [p1+16] __LF \ + ldp x2, x3, [p2+16] __LF \ + csel x0, x0, x2, ne __LF \ + csel x1, x1, x3, ne __LF \ + stp x0, x1, [p0+16] + +S2N_BN_SYMBOL(curve25519_pxscalarmul_alt): + +// Save regs and make room for temporaries + + stp x19, x22, [sp, -16]! + stp x20, x21, [sp, -16]! 
+ sub sp, sp, #NSPACE + +// Move the input arguments to stable places + + mov res, x0 + mov scalar, x1 + mov point, x2 + +// Initialize (xn,zn) = (1,0) and (xm,zm) = (x,1) with swap = 0 + + mov x2, #1 + stp x2, xzr, [xn] + stp xzr, xzr, [xn+16] + stp xzr, xzr, [zn] + stp xzr, xzr, [zn+16] + ldp x0, x1, [x] + stp x0, x1, [xm] + ldp x0, x1, [x+16] + stp x0, x1, [xm+16] + ldp x0, x1, [x+32] + stp x2, xzr, [zm] + stp xzr, xzr, [zm+16] + mov swap, xzr + +// The outer loop from i = 255, ..., i = 0 (inclusive) + + mov i, #255 + +curve25519_pxscalarmul_alt_loop: + +// sm = xm + zm; sn = xn + zn; dm = xm - zm; dn = xn - zn +// The adds don't need any normalization as they're fed to muls +// Just make sure the subs fit in 4 digits + + sub_4(dm, xm, zm) + add_4(sn, xn, zn) + sub_4(dn, xn, zn) + add_4(sm, xm, zm) + +// ADDING: dmsn = dm * sn; dnsm = sm * dn +// DOUBLING: mux d = xt - zt and s = xt + zt for appropriate choice of (xt,zt) + + mul_4(dmsn,sn,dm) + + lsr x0, i, #6 + ldr x2, [scalar, x0, lsl #3] + lsr x2, x2, i + and x2, x2, #1 + + cmp swap, x2 + mov swap, x2 + + mux_4(d,dm,dn) + mux_4(s,sm,sn) + + mul_4(dnsm,sm,dn) + +// DOUBLING: d = (xt - zt)^2 normalized only to 4 digits + + sqr_4(d,d) + +// ADDING: dpro = (dmsn - dnsm)^2, spro = (dmsn + dnsm)^2 +// DOUBLING: s = (xt + zt)^2, normalized only to 4 digits + + sub_twice4(dpro,dmsn,dnsm) + sqr_4(s,s) + add_twice4(spro,dmsn,dnsm) + sqr_4(dpro,dpro) + +// DOUBLING: p = 4 * xt * zt = s - d + + sub_twice4(p,s,d) + +// ADDING: xm' = (dmsn + dnsm)^2 + + sqr_p25519(xm,spro) + +// DOUBLING: e = 121666 * p + d + + mov x1, 0xdb42 + orr x1, x1, 0x10000 + cmadd_4(e,p,d) + +// DOUBLING: xn' = (xt + zt)^2 * (xt - zt)^2 = s * d + + mul_p25519(xn,s,d) + +// ADDING: zm' = x * (dmsn - dnsm)^2 + + mul_p25519(zm,dpro,x) + +// DOUBLING: zn' = (4 * xt * zt) * ((xt - zt)^2 + 121666 * (4 * xt * zt)) +// = p * (d + 121666 * p) + + mul_p25519(zn,p,e) + +// Loop down as far as 0 (inclusive) + + subs i, i, #1 + bcs curve25519_pxscalarmul_alt_loop + +// The main loop does not handle the special input of the 2-torsion +// point = (0,0). In that case we may get a spurious (0,0) as output +// when we want (0,1) [for odd scalar] or (1,0) [for even scalar]. +// Test if x = 0 (this is equivalent for curve25519 to y = 0) and if +// so, patch zm = 1 [for odd multiple], xn = 1 [for even multiple]. 
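Before the corresponding assembly, the patch-up just described can be summarized by this C sketch (illustrative names; the assembly performs the test with cmp/cset and orr, whereas the branch-freedom of the C version ultimately depends on the compiler).

    #include <stdint.h>

    /* If the input x coordinate is zero (the 2-torsion point), force the low
       limb of zm and of xn to 1 so the candidate outputs become (0,1) and
       (1,0) respectively; otherwise leave everything unchanged. */
    static void patch_two_torsion(uint64_t zm[4], uint64_t xn[4], const uint64_t x[4])
    {
      uint64_t any     = x[0] | x[1] | x[2] | x[3];
      uint64_t is_zero = (uint64_t)(any == 0);   /* 1 iff x == 0 */
      zm[0] |= is_zero;   /* picked when the scalar is odd  */
      xn[0] |= is_zero;   /* picked when the scalar is even */
    }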
+ + ldp x0, x1, [point] + orr x0, x0, x1 + ldp x2, x3, [point, #16] + orr x2, x2, x3 + orr x0, x0, x2 + cmp x0, xzr + cset x0, eq + ldr x1, [zm] + orr x1, x1, x0 + str x1, [zm] + ldr x2, [xn] + orr x2, x2, x0 + str x2, [xn] + +// Multiplex into the final outputs + + cmp swap, xzr + + mux_4(resx,xm,xn) + mux_4(resz,zm,zn) + +// Restore stack and registers + + add sp, sp, #NSPACE + ldp x20, x21, [sp], 16 + ldp x19, x22, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519.S similarity index 100% rename from third_party/s2n-bignum/arm/curve25519/curve25519_x25519.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519.S diff --git a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519_alt.S similarity index 69% rename from third_party/s2n-bignum/arm/curve25519/curve25519_x25519_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519_alt.S index 518cb895555..99c2bcced39 100644 --- a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519_alt.S @@ -79,204 +79,204 @@ // trivially different from a pure function call to that subroutine. #define mul_p25519(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - mul x12, x3, x7; \ - umulh x13, x3, x7; \ - mul x11, x3, x8; \ - umulh x14, x3, x8; \ - adds x13, x13, x11; \ - ldp x9, x10, [P2+16]; \ - mul x11, x3, x9; \ - umulh x15, x3, x9; \ - adcs x14, x14, x11; \ - mul x11, x3, x10; \ - umulh x16, x3, x10; \ - adcs x15, x15, x11; \ - adc x16, x16, xzr; \ - ldp x5, x6, [P1+16]; \ - mul x11, x4, x7; \ - adds x13, x13, x11; \ - mul x11, x4, x8; \ - adcs x14, x14, x11; \ - mul x11, x4, x9; \ - adcs x15, x15, x11; \ - mul x11, x4, x10; \ - adcs x16, x16, x11; \ - umulh x3, x4, x10; \ - adc x3, x3, xzr; \ - umulh x11, x4, x7; \ - adds x14, x14, x11; \ - umulh x11, x4, x8; \ - adcs x15, x15, x11; \ - umulh x11, x4, x9; \ - adcs x16, x16, x11; \ - adc x3, x3, xzr; \ - mul x11, x5, x7; \ - adds x14, x14, x11; \ - mul x11, x5, x8; \ - adcs x15, x15, x11; \ - mul x11, x5, x9; \ - adcs x16, x16, x11; \ - mul x11, x5, x10; \ - adcs x3, x3, x11; \ - umulh x4, x5, x10; \ - adc x4, x4, xzr; \ - umulh x11, x5, x7; \ - adds x15, x15, x11; \ - umulh x11, x5, x8; \ - adcs x16, x16, x11; \ - umulh x11, x5, x9; \ - adcs x3, x3, x11; \ - adc x4, x4, xzr; \ - mul x11, x6, x7; \ - adds x15, x15, x11; \ - mul x11, x6, x8; \ - adcs x16, x16, x11; \ - mul x11, x6, x9; \ - adcs x3, x3, x11; \ - mul x11, x6, x10; \ - adcs x4, x4, x11; \ - umulh x5, x6, x10; \ - adc x5, x5, xzr; \ - umulh x11, x6, x7; \ - adds x16, x16, x11; \ - umulh x11, x6, x8; \ - adcs x3, x3, x11; \ - umulh x11, x6, x9; \ - adcs x4, x4, x11; \ - adc x5, x5, xzr; \ - mov x7, #0x26; \ - mul x11, x7, x16; \ - umulh x9, x7, x16; \ - adds x12, x12, x11; \ - mul x11, x7, x3; \ - umulh x3, x7, x3; \ - adcs x13, x13, x11; \ - mul x11, x7, x4; \ - umulh x4, x7, x4; \ - adcs x14, x14, x11; \ - mul x11, x7, x5; \ - umulh x5, x7, x5; \ - adcs x15, x15, x11; \ - cset x16, cs; \ - adds x15, x15, x4; \ - adc x16, x16, x5; \ - cmn x15, x15; \ - orr x15, x15, #0x8000000000000000; \ - adc x8, x16, x16; \ - mov x7, #0x13; \ - madd x11, x7, x8, x7; \ - adds x12, x12, x11; \ - adcs 
x13, x13, x9; \ - adcs x14, x14, x3; \ - adcs x15, x15, xzr; \ - csel x7, x7, xzr, cc; \ - subs x12, x12, x7; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbc x15, x15, xzr; \ - and x15, x15, #0x7fffffffffffffff; \ - stp x12, x13, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + orr x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + madd x11, x7, x8, x7 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x15, x15, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + and x15, x15, #0x7fffffffffffffff __LF \ + stp x12, x13, [P0] __LF \ stp x14, x15, [P0+16] // A version of multiplication that only guarantees output < 2 * p_25519. // This basically skips the +1 and final correction in quotient estimation. 
#define mul_4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - mul x12, x3, x7; \ - umulh x13, x3, x7; \ - mul x11, x3, x8; \ - umulh x14, x3, x8; \ - adds x13, x13, x11; \ - ldp x9, x10, [P2+16]; \ - mul x11, x3, x9; \ - umulh x15, x3, x9; \ - adcs x14, x14, x11; \ - mul x11, x3, x10; \ - umulh x16, x3, x10; \ - adcs x15, x15, x11; \ - adc x16, x16, xzr; \ - ldp x5, x6, [P1+16]; \ - mul x11, x4, x7; \ - adds x13, x13, x11; \ - mul x11, x4, x8; \ - adcs x14, x14, x11; \ - mul x11, x4, x9; \ - adcs x15, x15, x11; \ - mul x11, x4, x10; \ - adcs x16, x16, x11; \ - umulh x3, x4, x10; \ - adc x3, x3, xzr; \ - umulh x11, x4, x7; \ - adds x14, x14, x11; \ - umulh x11, x4, x8; \ - adcs x15, x15, x11; \ - umulh x11, x4, x9; \ - adcs x16, x16, x11; \ - adc x3, x3, xzr; \ - mul x11, x5, x7; \ - adds x14, x14, x11; \ - mul x11, x5, x8; \ - adcs x15, x15, x11; \ - mul x11, x5, x9; \ - adcs x16, x16, x11; \ - mul x11, x5, x10; \ - adcs x3, x3, x11; \ - umulh x4, x5, x10; \ - adc x4, x4, xzr; \ - umulh x11, x5, x7; \ - adds x15, x15, x11; \ - umulh x11, x5, x8; \ - adcs x16, x16, x11; \ - umulh x11, x5, x9; \ - adcs x3, x3, x11; \ - adc x4, x4, xzr; \ - mul x11, x6, x7; \ - adds x15, x15, x11; \ - mul x11, x6, x8; \ - adcs x16, x16, x11; \ - mul x11, x6, x9; \ - adcs x3, x3, x11; \ - mul x11, x6, x10; \ - adcs x4, x4, x11; \ - umulh x5, x6, x10; \ - adc x5, x5, xzr; \ - umulh x11, x6, x7; \ - adds x16, x16, x11; \ - umulh x11, x6, x8; \ - adcs x3, x3, x11; \ - umulh x11, x6, x9; \ - adcs x4, x4, x11; \ - adc x5, x5, xzr; \ - mov x7, #0x26; \ - mul x11, x7, x16; \ - umulh x9, x7, x16; \ - adds x12, x12, x11; \ - mul x11, x7, x3; \ - umulh x3, x7, x3; \ - adcs x13, x13, x11; \ - mul x11, x7, x4; \ - umulh x4, x7, x4; \ - adcs x14, x14, x11; \ - mul x11, x7, x5; \ - umulh x5, x7, x5; \ - adcs x15, x15, x11; \ - cset x16, cs; \ - adds x15, x15, x4; \ - adc x16, x16, x5; \ - cmn x15, x15; \ - bic x15, x15, #0x8000000000000000; \ - adc x8, x16, x16; \ - mov x7, #0x13; \ - mul x11, x7, x8; \ - adds x12, x12, x11; \ - adcs x13, x13, x9; \ - adcs x14, x14, x3; \ - adc x15, x15, xzr; \ - stp x12, x13, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs 
x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + bic x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + mul x11, x7, x8 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adc x15, x15, xzr __LF \ + stp x12, x13, [P0] __LF \ stp x14, x15, [P0+16] // Squaring just giving a result < 2 * p_25519, which is done by @@ -284,77 +284,77 @@ // optional correction. #define sqr_4(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x9, x2, x3; \ - umulh x10, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x11, x2, x5; \ - umulh x12, x2, x5; \ - mul x7, x2, x4; \ - umulh x6, x2, x4; \ - adds x10, x10, x7; \ - adcs x11, x11, x6; \ - mul x7, x3, x4; \ - umulh x6, x3, x4; \ - adc x6, x6, xzr; \ - adds x11, x11, x7; \ - mul x13, x4, x5; \ - umulh x14, x4, x5; \ - adcs x12, x12, x6; \ - mul x7, x3, x5; \ - umulh x6, x3, x5; \ - adc x6, x6, xzr; \ - adds x12, x12, x7; \ - adcs x13, x13, x6; \ - adc x14, x14, xzr; \ - adds x9, x9, x9; \ - adcs x10, x10, x10; \ - adcs x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - cset x6, cs; \ - umulh x7, x2, x2; \ - mul x8, x2, x2; \ - adds x9, x9, x7; \ - mul x7, x3, x3; \ - adcs x10, x10, x7; \ - umulh x7, x3, x3; \ - adcs x11, x11, x7; \ - mul x7, x4, x4; \ - adcs x12, x12, x7; \ - umulh x7, x4, x4; \ - adcs x13, x13, x7; \ - mul x7, x5, x5; \ - adcs x14, x14, x7; \ - umulh x7, x5, x5; \ - adc x6, x6, x7; \ - mov x3, #0x26; \ - mul x7, x3, x12; \ - umulh x4, x3, x12; \ - adds x8, x8, x7; \ - mul x7, x3, x13; \ - umulh x13, x3, x13; \ - adcs x9, x9, x7; \ - mul x7, x3, x14; \ - umulh x14, x3, x14; \ - adcs x10, x10, x7; \ - mul x7, x3, x6; \ - umulh x6, x3, x6; \ - adcs x11, x11, x7; \ - cset x12, cs; \ - adds x11, x11, x14; \ - adc x12, x12, x6; \ - cmn x11, x11; \ - bic x11, x11, #0x8000000000000000; \ - adc x2, x12, x12; \ - mov x3, #0x13; \ - mul x7, x3, x2; \ - adds x8, x8, x7; \ - adcs x9, x9, x4; \ - adcs x10, x10, x13; \ - adc x11, x11, xzr; \ - stp x8, x9, [P0]; \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x7, x2, x4 __LF \ + umulh x6, x2, x4 __LF \ + adds x10, x10, x7 __LF \ + adcs x11, x11, x6 __LF \ + mul x7, x3, x4 __LF \ + umulh x6, x3, x4 __LF \ + adc x6, x6, xzr __LF \ + adds x11, x11, x7 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x6 __LF \ + mul x7, x3, x5 __LF \ + umulh x6, x3, x5 __LF \ + adc x6, x6, xzr __LF \ + adds x12, x12, x7 __LF \ + adcs x13, x13, x6 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x6, cs 
__LF \ + umulh x7, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x7 __LF \ + mul x7, x3, x3 __LF \ + adcs x10, x10, x7 __LF \ + umulh x7, x3, x3 __LF \ + adcs x11, x11, x7 __LF \ + mul x7, x4, x4 __LF \ + adcs x12, x12, x7 __LF \ + umulh x7, x4, x4 __LF \ + adcs x13, x13, x7 __LF \ + mul x7, x5, x5 __LF \ + adcs x14, x14, x7 __LF \ + umulh x7, x5, x5 __LF \ + adc x6, x6, x7 __LF \ + mov x3, #0x26 __LF \ + mul x7, x3, x12 __LF \ + umulh x4, x3, x12 __LF \ + adds x8, x8, x7 __LF \ + mul x7, x3, x13 __LF \ + umulh x13, x3, x13 __LF \ + adcs x9, x9, x7 __LF \ + mul x7, x3, x14 __LF \ + umulh x14, x3, x14 __LF \ + adcs x10, x10, x7 __LF \ + mul x7, x3, x6 __LF \ + umulh x6, x3, x6 __LF \ + adcs x11, x11, x7 __LF \ + cset x12, cs __LF \ + adds x11, x11, x14 __LF \ + adc x12, x12, x6 __LF \ + cmn x11, x11 __LF \ + bic x11, x11, #0x8000000000000000 __LF \ + adc x2, x12, x12 __LF \ + mov x3, #0x13 __LF \ + mul x7, x3, x2 __LF \ + adds x8, x8, x7 __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x13 __LF \ + adc x11, x11, xzr __LF \ + stp x8, x9, [P0] __LF \ stp x10, x11, [P0+16] // Modular addition with double modulus 2 * p_25519 = 2^256 - 38. @@ -364,41 +364,41 @@ // at least one of them is reduced double modulo. #define add_twice4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P2+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] // Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 #define sub_twice4(p0,p1,p2) \ - ldp x5, x6, [p1]; \ - ldp x4, x3, [p2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [p1+16]; \ - ldp x4, x3, [p2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - mov x4, #38; \ - csel x3, x4, xzr, lo; \ - subs x5, x5, x3; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbc x8, x8, xzr; \ - stp x5, x6, [p0]; \ + ldp x5, x6, [p1] __LF \ + ldp x4, x3, [p2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [p1+16] __LF \ + ldp x4, x3, [p2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [p0] __LF \ stp x7, x8, [p0+16] // Combined z = c * x + y with reduction only < 2 * p_25519 @@ -407,51 +407,51 @@ // high mul in the final part. 
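For the cmadd_4 macro that follows, the size assumption in the comment above (19 * (c * x + y) < 2^60 * 2^256) is what makes a low-only multiply by 19 sufficient. A C sketch of the same computation, assuming a GCC/Clang unsigned __int128 type and illustrative names:

    #include <stdint.h>

    /* z = c * x + y for a small constant c, folded back below 2 * p_25519
       using 2^255 == 19 (mod p_25519).  The low-only 19 * top multiply is
       enough because of the size bound quoted above. */
    static void cmadd_4_sketch(uint64_t z[4], uint64_t c,
                               const uint64_t x[4], const uint64_t y[4])
    {
      unsigned __int128 acc = 0;        /* assumes a GCC/Clang 128-bit type */
      uint64_t t[4];
      for (int i = 0; i < 4; i++) {
        acc += (unsigned __int128)c * x[i] + y[i];
        t[i] = (uint64_t)acc;
        acc >>= 64;
      }
      uint64_t hi  = (uint64_t)acc;                /* bits 256 and up of c*x + y */
      uint64_t top = (hi << 1) | (t[3] >> 63);     /* floor((c*x + y) / 2^255)   */
      t[3] &= 0x7fffffffffffffffULL;
      acc = (unsigned __int128)19 * top;           /* 2^255 == 19 (mod p_25519)  */
      for (int i = 0; i < 4; i++) {
        acc += t[i];
        z[i] = (uint64_t)acc;
        acc >>= 64;
      }
    }

In the Montgomery ladder above, this pattern is invoked with c = 121666 to form e = d + 121666 * p.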
#define cmadd_4(p0,p2,p3) \ - ldp x7, x8, [p2]; \ - ldp x9, x10, [p2+16]; \ - mul x3, x1, x7; \ - mul x4, x1, x8; \ - mul x5, x1, x9; \ - mul x6, x1, x10; \ - umulh x7, x1, x7; \ - umulh x8, x1, x8; \ - umulh x9, x1, x9; \ - umulh x10, x1, x10; \ - adds x4, x4, x7; \ - adcs x5, x5, x8; \ - adcs x6, x6, x9; \ - adc x10, x10, xzr; \ - ldp x7, x8, [p3]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x7, x8, [p3+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - adc x10, x10, xzr; \ - cmn x6, x6; \ - bic x6, x6, #0x8000000000000000; \ - adc x8, x10, x10; \ - mov x9, #19; \ - mul x7, x8, x9; \ - adds x3, x3, x7; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [p0]; \ + ldp x7, x8, [p2] __LF \ + ldp x9, x10, [p2+16] __LF \ + mul x3, x1, x7 __LF \ + mul x4, x1, x8 __LF \ + mul x5, x1, x9 __LF \ + mul x6, x1, x10 __LF \ + umulh x7, x1, x7 __LF \ + umulh x8, x1, x8 __LF \ + umulh x9, x1, x9 __LF \ + umulh x10, x1, x10 __LF \ + adds x4, x4, x7 __LF \ + adcs x5, x5, x8 __LF \ + adcs x6, x6, x9 __LF \ + adc x10, x10, xzr __LF \ + ldp x7, x8, [p3] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x7, x8, [p3+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + adc x10, x10, xzr __LF \ + cmn x6, x6 __LF \ + bic x6, x6, #0x8000000000000000 __LF \ + adc x8, x10, x10 __LF \ + mov x9, #19 __LF \ + mul x7, x8, x9 __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [p0] __LF \ stp x5, x6, [p0+16] // Multiplex: z := if NZ then x else y #define mux_4(p0,p1,p2) \ - ldp x0, x1, [p1]; \ - ldp x2, x3, [p2]; \ - csel x0, x0, x2, ne; \ - csel x1, x1, x3, ne; \ - stp x0, x1, [p0]; \ - ldp x0, x1, [p1+16]; \ - ldp x2, x3, [p2+16]; \ - csel x0, x0, x2, ne; \ - csel x1, x1, x3, ne; \ + ldp x0, x1, [p1] __LF \ + ldp x2, x3, [p2] __LF \ + csel x0, x0, x2, ne __LF \ + csel x1, x1, x3, ne __LF \ + stp x0, x1, [p0] __LF \ + ldp x0, x1, [p1+16] __LF \ + ldp x2, x3, [p2+16] __LF \ + csel x0, x0, x2, ne __LF \ + csel x1, x1, x3, ne __LF \ stp x0, x1, [p0+16] S2N_BN_SYMBOL(curve25519_x25519_alt): diff --git a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519_byte.S similarity index 100% rename from third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519_byte.S diff --git a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519_byte_alt.S similarity index 72% rename from third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519_byte_alt.S index 511e2960bd3..fc71df70903 100644 --- a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519_byte_alt.S @@ -79,204 +79,204 @@ // trivially different from a pure function call to that subroutine. 
#define mul_p25519(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - mul x12, x3, x7; \ - umulh x13, x3, x7; \ - mul x11, x3, x8; \ - umulh x14, x3, x8; \ - adds x13, x13, x11; \ - ldp x9, x10, [P2+16]; \ - mul x11, x3, x9; \ - umulh x15, x3, x9; \ - adcs x14, x14, x11; \ - mul x11, x3, x10; \ - umulh x16, x3, x10; \ - adcs x15, x15, x11; \ - adc x16, x16, xzr; \ - ldp x5, x6, [P1+16]; \ - mul x11, x4, x7; \ - adds x13, x13, x11; \ - mul x11, x4, x8; \ - adcs x14, x14, x11; \ - mul x11, x4, x9; \ - adcs x15, x15, x11; \ - mul x11, x4, x10; \ - adcs x16, x16, x11; \ - umulh x3, x4, x10; \ - adc x3, x3, xzr; \ - umulh x11, x4, x7; \ - adds x14, x14, x11; \ - umulh x11, x4, x8; \ - adcs x15, x15, x11; \ - umulh x11, x4, x9; \ - adcs x16, x16, x11; \ - adc x3, x3, xzr; \ - mul x11, x5, x7; \ - adds x14, x14, x11; \ - mul x11, x5, x8; \ - adcs x15, x15, x11; \ - mul x11, x5, x9; \ - adcs x16, x16, x11; \ - mul x11, x5, x10; \ - adcs x3, x3, x11; \ - umulh x4, x5, x10; \ - adc x4, x4, xzr; \ - umulh x11, x5, x7; \ - adds x15, x15, x11; \ - umulh x11, x5, x8; \ - adcs x16, x16, x11; \ - umulh x11, x5, x9; \ - adcs x3, x3, x11; \ - adc x4, x4, xzr; \ - mul x11, x6, x7; \ - adds x15, x15, x11; \ - mul x11, x6, x8; \ - adcs x16, x16, x11; \ - mul x11, x6, x9; \ - adcs x3, x3, x11; \ - mul x11, x6, x10; \ - adcs x4, x4, x11; \ - umulh x5, x6, x10; \ - adc x5, x5, xzr; \ - umulh x11, x6, x7; \ - adds x16, x16, x11; \ - umulh x11, x6, x8; \ - adcs x3, x3, x11; \ - umulh x11, x6, x9; \ - adcs x4, x4, x11; \ - adc x5, x5, xzr; \ - mov x7, #0x26; \ - mul x11, x7, x16; \ - umulh x9, x7, x16; \ - adds x12, x12, x11; \ - mul x11, x7, x3; \ - umulh x3, x7, x3; \ - adcs x13, x13, x11; \ - mul x11, x7, x4; \ - umulh x4, x7, x4; \ - adcs x14, x14, x11; \ - mul x11, x7, x5; \ - umulh x5, x7, x5; \ - adcs x15, x15, x11; \ - cset x16, cs; \ - adds x15, x15, x4; \ - adc x16, x16, x5; \ - cmn x15, x15; \ - orr x15, x15, #0x8000000000000000; \ - adc x8, x16, x16; \ - mov x7, #0x13; \ - madd x11, x7, x8, x7; \ - adds x12, x12, x11; \ - adcs x13, x13, x9; \ - adcs x14, x14, x3; \ - adcs x15, x15, xzr; \ - csel x7, x7, xzr, cc; \ - subs x12, x12, x7; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbc x15, x15, xzr; \ - and x15, x15, #0x7fffffffffffffff; \ - stp x12, x13, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 
__LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + orr x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + madd x11, x7, x8, x7 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x15, x15, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + and x15, x15, #0x7fffffffffffffff __LF \ + stp x12, x13, [P0] __LF \ stp x14, x15, [P0+16] // A version of multiplication that only guarantees output < 2 * p_25519. // This basically skips the +1 and final correction in quotient estimation. #define mul_4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - mul x12, x3, x7; \ - umulh x13, x3, x7; \ - mul x11, x3, x8; \ - umulh x14, x3, x8; \ - adds x13, x13, x11; \ - ldp x9, x10, [P2+16]; \ - mul x11, x3, x9; \ - umulh x15, x3, x9; \ - adcs x14, x14, x11; \ - mul x11, x3, x10; \ - umulh x16, x3, x10; \ - adcs x15, x15, x11; \ - adc x16, x16, xzr; \ - ldp x5, x6, [P1+16]; \ - mul x11, x4, x7; \ - adds x13, x13, x11; \ - mul x11, x4, x8; \ - adcs x14, x14, x11; \ - mul x11, x4, x9; \ - adcs x15, x15, x11; \ - mul x11, x4, x10; \ - adcs x16, x16, x11; \ - umulh x3, x4, x10; \ - adc x3, x3, xzr; \ - umulh x11, x4, x7; \ - adds x14, x14, x11; \ - umulh x11, x4, x8; \ - adcs x15, x15, x11; \ - umulh x11, x4, x9; \ - adcs x16, x16, x11; \ - adc x3, x3, xzr; \ - mul x11, x5, x7; \ - adds x14, x14, x11; \ - mul x11, x5, x8; \ - adcs x15, x15, x11; \ - mul x11, x5, x9; \ - adcs x16, x16, x11; \ - mul x11, x5, x10; \ - adcs x3, x3, x11; \ - umulh x4, x5, x10; \ - adc x4, x4, xzr; \ - umulh x11, x5, x7; \ - adds x15, x15, x11; \ - umulh x11, x5, x8; \ - adcs x16, x16, x11; \ - umulh x11, x5, x9; \ - adcs x3, x3, x11; \ - adc x4, x4, xzr; \ - mul x11, x6, x7; \ - adds x15, x15, x11; \ - mul x11, x6, x8; \ - adcs x16, x16, x11; \ - mul x11, x6, x9; \ - adcs x3, x3, x11; \ - mul x11, x6, x10; \ - adcs x4, x4, x11; \ - umulh x5, x6, x10; \ - adc x5, x5, xzr; \ - umulh x11, x6, x7; \ - adds x16, x16, x11; \ - umulh x11, x6, x8; \ - adcs x3, x3, x11; \ - umulh x11, x6, x9; \ - adcs x4, x4, x11; \ - adc x5, x5, xzr; \ - mov x7, #0x26; \ - mul x11, x7, x16; \ - umulh x9, x7, x16; \ - adds x12, x12, x11; \ - mul x11, x7, x3; \ - umulh x3, x7, x3; \ - adcs x13, x13, x11; \ - mul x11, x7, x4; \ - umulh x4, x7, x4; \ - adcs x14, x14, x11; \ - mul x11, x7, x5; \ - umulh x5, x7, x5; \ - adcs x15, x15, x11; \ - cset x16, cs; \ - adds x15, x15, x4; \ - adc x16, x16, x5; \ - cmn x15, x15; \ - bic x15, x15, #0x8000000000000000; \ - adc x8, x16, 
x16; \ - mov x7, #0x13; \ - mul x11, x7, x8; \ - adds x12, x12, x11; \ - adcs x13, x13, x9; \ - adcs x14, x14, x3; \ - adc x15, x15, xzr; \ - stp x12, x13, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + bic x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + mul x11, x7, x8 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adc x15, x15, xzr __LF \ + stp x12, x13, [P0] __LF \ stp x14, x15, [P0+16] // Squaring just giving a result < 2 * p_25519, which is done by @@ -284,77 +284,77 @@ // optional correction. 
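The two multiplication macros converted above differ only in how far they reduce: mul_p25519 returns the product fully reduced modulo p_25519 = 2^255 - 19, while mul_4 stops once the result is known to be below 2 * p_25519, skipping the +1 and final correction in the quotient estimate. A minimal Python sketch of those output contracts (reference semantics only, not the instruction-level algorithm; the names are illustrative):

P25519 = 2**255 - 19

def mul_p25519_ref(a, b):
    # Strict variant: result is the canonical representative below P25519.
    return (a * b) % P25519

def mul_4_ref(a, b):
    # Relaxed variant: result is only guaranteed to be below 2 * P25519.
    t = a * b
    t = (t & (2**256 - 1)) + 38 * (t >> 256)   # 2^256 = 38 (mod P25519)
    t = (t & (2**255 - 1)) + 19 * (t >> 255)   # 2^255 = 19 (mod P25519)
    return t                                   # congruent to a*b mod p, at most one extra p

The sqr_4 macro converted in the next hunk keeps the same relaxed < 2 * p_25519 bound for squaring.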
#define sqr_4(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x9, x2, x3; \ - umulh x10, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x11, x2, x5; \ - umulh x12, x2, x5; \ - mul x7, x2, x4; \ - umulh x6, x2, x4; \ - adds x10, x10, x7; \ - adcs x11, x11, x6; \ - mul x7, x3, x4; \ - umulh x6, x3, x4; \ - adc x6, x6, xzr; \ - adds x11, x11, x7; \ - mul x13, x4, x5; \ - umulh x14, x4, x5; \ - adcs x12, x12, x6; \ - mul x7, x3, x5; \ - umulh x6, x3, x5; \ - adc x6, x6, xzr; \ - adds x12, x12, x7; \ - adcs x13, x13, x6; \ - adc x14, x14, xzr; \ - adds x9, x9, x9; \ - adcs x10, x10, x10; \ - adcs x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - cset x6, cs; \ - umulh x7, x2, x2; \ - mul x8, x2, x2; \ - adds x9, x9, x7; \ - mul x7, x3, x3; \ - adcs x10, x10, x7; \ - umulh x7, x3, x3; \ - adcs x11, x11, x7; \ - mul x7, x4, x4; \ - adcs x12, x12, x7; \ - umulh x7, x4, x4; \ - adcs x13, x13, x7; \ - mul x7, x5, x5; \ - adcs x14, x14, x7; \ - umulh x7, x5, x5; \ - adc x6, x6, x7; \ - mov x3, #0x26; \ - mul x7, x3, x12; \ - umulh x4, x3, x12; \ - adds x8, x8, x7; \ - mul x7, x3, x13; \ - umulh x13, x3, x13; \ - adcs x9, x9, x7; \ - mul x7, x3, x14; \ - umulh x14, x3, x14; \ - adcs x10, x10, x7; \ - mul x7, x3, x6; \ - umulh x6, x3, x6; \ - adcs x11, x11, x7; \ - cset x12, cs; \ - adds x11, x11, x14; \ - adc x12, x12, x6; \ - cmn x11, x11; \ - bic x11, x11, #0x8000000000000000; \ - adc x2, x12, x12; \ - mov x3, #0x13; \ - mul x7, x3, x2; \ - adds x8, x8, x7; \ - adcs x9, x9, x4; \ - adcs x10, x10, x13; \ - adc x11, x11, xzr; \ - stp x8, x9, [P0]; \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x7, x2, x4 __LF \ + umulh x6, x2, x4 __LF \ + adds x10, x10, x7 __LF \ + adcs x11, x11, x6 __LF \ + mul x7, x3, x4 __LF \ + umulh x6, x3, x4 __LF \ + adc x6, x6, xzr __LF \ + adds x11, x11, x7 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x6 __LF \ + mul x7, x3, x5 __LF \ + umulh x6, x3, x5 __LF \ + adc x6, x6, xzr __LF \ + adds x12, x12, x7 __LF \ + adcs x13, x13, x6 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x6, cs __LF \ + umulh x7, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x7 __LF \ + mul x7, x3, x3 __LF \ + adcs x10, x10, x7 __LF \ + umulh x7, x3, x3 __LF \ + adcs x11, x11, x7 __LF \ + mul x7, x4, x4 __LF \ + adcs x12, x12, x7 __LF \ + umulh x7, x4, x4 __LF \ + adcs x13, x13, x7 __LF \ + mul x7, x5, x5 __LF \ + adcs x14, x14, x7 __LF \ + umulh x7, x5, x5 __LF \ + adc x6, x6, x7 __LF \ + mov x3, #0x26 __LF \ + mul x7, x3, x12 __LF \ + umulh x4, x3, x12 __LF \ + adds x8, x8, x7 __LF \ + mul x7, x3, x13 __LF \ + umulh x13, x3, x13 __LF \ + adcs x9, x9, x7 __LF \ + mul x7, x3, x14 __LF \ + umulh x14, x3, x14 __LF \ + adcs x10, x10, x7 __LF \ + mul x7, x3, x6 __LF \ + umulh x6, x3, x6 __LF \ + adcs x11, x11, x7 __LF \ + cset x12, cs __LF \ + adds x11, x11, x14 __LF \ + adc x12, x12, x6 __LF \ + cmn x11, x11 __LF \ + bic x11, x11, #0x8000000000000000 __LF \ + adc x2, x12, x12 __LF \ + mov x3, #0x13 __LF \ + mul x7, x3, x2 __LF \ + adds x8, x8, x7 __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x13 __LF \ + adc x11, x11, xzr __LF \ + stp x8, x9, [P0] __LF \ stp x10, x11, [P0+16] // Modular addition with double modulus 2 * p_25519 = 2^256 - 38. 
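The add_twice4 macro in the next hunk adds two 256-bit values modulo 2 * p_25519 = 2^256 - 38: if the four-limb addition carries out of 256 bits, adding 38 back is the same as subtracting 2 * p_25519. A hedged Python sketch of that contract (illustrative only; it relies on the assumption in the surrounding comment that at least one operand is already below 2 * p_25519):

P25519 = 2**255 - 19

def add_twice4_ref(a, b):
    s = a + b
    if s >= 2**256:              # the carry that "csel x9, x9, xzr, cs" checks
        s = s - 2**256 + 38      # i.e. subtract 2 * P25519
    return s                     # fits in 256 bits under the stated input assumption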
@@ -364,41 +364,41 @@ // at least one of them is reduced double modulo. #define add_twice4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P2+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] // Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 #define sub_twice4(p0,p1,p2) \ - ldp x5, x6, [p1]; \ - ldp x4, x3, [p2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [p1+16]; \ - ldp x4, x3, [p2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - mov x4, #38; \ - csel x3, x4, xzr, lo; \ - subs x5, x5, x3; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbc x8, x8, xzr; \ - stp x5, x6, [p0]; \ + ldp x5, x6, [p1] __LF \ + ldp x4, x3, [p2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [p1+16] __LF \ + ldp x4, x3, [p2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [p0] __LF \ stp x7, x8, [p0+16] // Combined z = c * x + y with reduction only < 2 * p_25519 @@ -407,51 +407,51 @@ // high mul in the final part. 
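The cmadd_4 macro defined next computes z = c * x + y, where c is the single 64-bit word already held in x1, and reduces only below 2 * p_25519 by folding everything at or above bit 255 back in with the factor 19 (the "mov x9, #19" step). A rough Python model of that reduction (editor's sketch, not the register-level sequence):

P25519 = 2**255 - 19

def cmadd_4_ref(c, x, y):
    t = c * x + y                                # five-limb intermediate
    return (t & (2**255 - 1)) + 19 * (t >> 255)  # 2^255 = 19 (mod P25519); result < 2 * P25519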
#define cmadd_4(p0,p2,p3) \ - ldp x7, x8, [p2]; \ - ldp x9, x10, [p2+16]; \ - mul x3, x1, x7; \ - mul x4, x1, x8; \ - mul x5, x1, x9; \ - mul x6, x1, x10; \ - umulh x7, x1, x7; \ - umulh x8, x1, x8; \ - umulh x9, x1, x9; \ - umulh x10, x1, x10; \ - adds x4, x4, x7; \ - adcs x5, x5, x8; \ - adcs x6, x6, x9; \ - adc x10, x10, xzr; \ - ldp x7, x8, [p3]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x7, x8, [p3+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - adc x10, x10, xzr; \ - cmn x6, x6; \ - bic x6, x6, #0x8000000000000000; \ - adc x8, x10, x10; \ - mov x9, #19; \ - mul x7, x8, x9; \ - adds x3, x3, x7; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [p0]; \ + ldp x7, x8, [p2] __LF \ + ldp x9, x10, [p2+16] __LF \ + mul x3, x1, x7 __LF \ + mul x4, x1, x8 __LF \ + mul x5, x1, x9 __LF \ + mul x6, x1, x10 __LF \ + umulh x7, x1, x7 __LF \ + umulh x8, x1, x8 __LF \ + umulh x9, x1, x9 __LF \ + umulh x10, x1, x10 __LF \ + adds x4, x4, x7 __LF \ + adcs x5, x5, x8 __LF \ + adcs x6, x6, x9 __LF \ + adc x10, x10, xzr __LF \ + ldp x7, x8, [p3] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x7, x8, [p3+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + adc x10, x10, xzr __LF \ + cmn x6, x6 __LF \ + bic x6, x6, #0x8000000000000000 __LF \ + adc x8, x10, x10 __LF \ + mov x9, #19 __LF \ + mul x7, x8, x9 __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [p0] __LF \ stp x5, x6, [p0+16] // Multiplex: z := if NZ then x else y #define mux_4(p0,p1,p2) \ - ldp x0, x1, [p1]; \ - ldp x2, x3, [p2]; \ - csel x0, x0, x2, ne; \ - csel x1, x1, x3, ne; \ - stp x0, x1, [p0]; \ - ldp x0, x1, [p1+16]; \ - ldp x2, x3, [p2+16]; \ - csel x0, x0, x2, ne; \ - csel x1, x1, x3, ne; \ + ldp x0, x1, [p1] __LF \ + ldp x2, x3, [p2] __LF \ + csel x0, x0, x2, ne __LF \ + csel x1, x1, x3, ne __LF \ + stp x0, x1, [p0] __LF \ + ldp x0, x1, [p1+16] __LF \ + ldp x2, x3, [p2+16] __LF \ + csel x0, x0, x2, ne __LF \ + csel x1, x1, x3, ne __LF \ stp x0, x1, [p0+16] S2N_BN_SYMBOL(curve25519_x25519_byte_alt): diff --git a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519base.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519base.S similarity index 92% rename from third_party/s2n-bignum/arm/curve25519/curve25519_x25519base.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519base.S index ef46f7b169e..748032a8ece 100644 --- a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519base.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519base.S @@ -78,382 +78,382 @@ // trivially different from a pure function call to that subroutine. 
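The mux_4 macro just converted is a constant-time multiplex (z := if NZ then x else y): every limb of both inputs is loaded and a csel picks one per limb, so memory access does not depend on the condition. A mask-based Python sketch of the same data flow (illustrative; the real macro keys off the condition flags set by an earlier instruction):

def mux_4_ref(take_x, x, y):
    mask = -int(bool(take_x)) & (2**256 - 1)
    return (x & mask) | (y & ~mask & (2**256 - 1))

The following hunks rename curve25519_x25519base.S under s2n-bignum-imported/ and apply the same semicolon-to-__LF rewrite to its copies of these field macros, starting with mul_p25519 below.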
#define mul_p25519(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - umull x7, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x8, w16, w0; \ - umull x16, w3, w16; \ - adds x7, x7, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x8, x8, x15; \ - adds x7, x7, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x8, x8, x16; \ - mul x9, x4, x6; \ - umulh x10, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x9, x9, x8; \ - adc x10, x10, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x8, x7, x9; \ - adcs x9, x9, x10; \ - adc x10, x10, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x8, x15, x8; \ - eor x3, x3, x16; \ - adcs x9, x3, x9; \ - adc x10, x10, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x5, x6, [P2+16]; \ - umull x11, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x12, w16, w0; \ - umull x16, w3, w16; \ - adds x11, x11, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x12, x12, x15; \ - adds x11, x11, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x12, x12, x16; \ - mul x13, x4, x6; \ - umulh x14, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x13, x13, x12; \ - adc x14, x14, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x12, x11, x13; \ - adcs x13, x13, x14; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x12, x15, x12; \ - eor x3, x3, x16; \ - adcs x13, x3, x13; \ - adc x14, x14, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x15, x16, [P1]; \ - subs x3, x3, x15; \ - sbcs x4, x4, x16; \ - csetm x16, cc; \ - ldp x15, x0, [P2]; \ - subs x5, x15, x5; \ - sbcs x6, x0, x6; \ - csetm x0, cc; \ - eor x3, x3, x16; \ - subs x3, x3, x16; \ - eor x4, x4, x16; \ - sbc x4, x4, x16; \ - eor x5, x5, x0; \ - subs x5, x5, x0; \ - eor x6, x6, x0; \ - sbc x6, x6, x0; \ - eor x16, x0, x16; \ - adds x11, x11, x9; \ - adcs x12, x12, x10; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - mul x2, x3, x5; \ - umulh x0, x3, x5; \ - mul x15, x4, x6; \ - umulh x1, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x9, cc; \ - adds x15, x15, x0; \ - adc x1, x1, xzr; \ - subs x6, x5, x6; \ - cneg x6, x6, cc; \ - cinv x9, x9, cc; \ - mul x5, x4, x6; \ - umulh x6, x4, x6; \ - adds x0, x2, x15; \ - adcs x15, x15, x1; \ - adc x1, x1, xzr; \ - cmn x9, #0x1; \ - eor x5, x5, x9; \ - adcs x0, x5, x0; \ - eor x6, x6, x9; \ - adcs x15, x6, x15; \ - adc x1, x1, x9; \ - adds x9, x11, x7; \ - adcs x10, x12, x8; \ - adcs x11, x13, x11; \ - adcs x12, x14, x12; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x2, x2, x16; \ - adcs x9, x2, x9; \ - eor x0, x0, x16; \ - adcs x10, x0, x10; \ - eor x15, x15, x16; \ - adcs x11, x15, x11; \ - eor x1, x1, x16; \ - adcs x12, x1, x12; \ - adcs x13, x13, x16; \ - adc x14, x14, x16; \ - mov x3, #0x26; \ - umull x4, w11, w3; \ - add x4, x4, w7, uxtw; \ - lsr x7, x7, #32; \ - lsr x11, x11, #32; \ - umaddl x11, w11, w3, x7; \ - mov x7, x4; \ - umull x4, w12, w3; \ - add x4, x4, w8, uxtw; \ - lsr x8, x8, #32; \ - lsr x12, x12, #32; \ - umaddl x12, w12, w3, x8; \ - mov x8, x4; \ - umull x4, w13, w3; \ - add x4, x4, w9, uxtw; \ - lsr x9, x9, #32; \ - lsr x13, x13, #32; \ - umaddl x13, w13, w3, x9; \ - mov x9, x4; \ - umull x4, w14, w3; \ - add x4, x4, w10, uxtw; \ - lsr x10, x10, #32; \ - lsr x14, x14, #32; \ - umaddl x14, w14, w3, x10; \ - mov x10, x4; \ - lsr x0, x14, 
#31; \ - mov x5, #0x13; \ - umaddl x5, w5, w0, x5; \ - add x7, x7, x5; \ - adds x7, x7, x11, lsl #32; \ - extr x3, x12, x11, #32; \ - adcs x8, x8, x3; \ - extr x3, x13, x12, #32; \ - adcs x9, x9, x3; \ - extr x3, x14, x13, #32; \ - lsl x5, x0, #63; \ - eor x10, x10, x5; \ - adc x10, x10, x3; \ - mov x3, #0x13; \ - tst x10, #0x8000000000000000; \ - csel x3, x3, xzr, pl; \ - subs x7, x7, x3; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbc x10, x10, xzr; \ - and x10, x10, #0x7fffffffffffffff; \ - stp x7, x8, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 
__LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umaddl x5, w5, w0, x5 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + mov x3, #0x13 __LF \ + tst x10, #0x8000000000000000 __LF \ + csel x3, x3, xzr, pl __LF \ + subs x7, x7, x3 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + and x10, x10, #0x7fffffffffffffff __LF \ + stp x7, x8, [P0] __LF \ stp x9, x10, [P0+16] // A version of multiplication that only guarantees output < 2 * p_25519. // This basically skips the +1 and final correction in quotient estimation. 
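Unlike the _alt variant earlier in the diff, the mul_p25519 above (and the mul_4 that follows) builds each 64x64-bit partial product from four 32x32-bit umull multiplies rather than mul/umulh pairs. A small Python sketch of that building block (illustrative names; the macro interleaves the additions instead of forming t explicitly):

def mul64x64_via_32bit_halves(a, b):
    a_lo, a_hi = a & 0xffffffff, a >> 32
    b_lo, b_hi = b & 0xffffffff, b >> 32
    t = (a_lo * b_lo
         + ((a_hi * b_lo) << 32)
         + ((a_lo * b_hi) << 32)
         + ((a_hi * b_hi) << 64))
    return t & (2**64 - 1), t >> 64   # (low limb, high limb) of a * b

assert mul64x64_via_32bit_halves(2**64 - 1, 2**64 - 1) == (1, 2**64 - 2)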
#define mul_4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - umull x7, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x8, w16, w0; \ - umull x16, w3, w16; \ - adds x7, x7, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x8, x8, x15; \ - adds x7, x7, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x8, x8, x16; \ - mul x9, x4, x6; \ - umulh x10, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x9, x9, x8; \ - adc x10, x10, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x8, x7, x9; \ - adcs x9, x9, x10; \ - adc x10, x10, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x8, x15, x8; \ - eor x3, x3, x16; \ - adcs x9, x3, x9; \ - adc x10, x10, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x5, x6, [P2+16]; \ - umull x11, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x12, w16, w0; \ - umull x16, w3, w16; \ - adds x11, x11, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x12, x12, x15; \ - adds x11, x11, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x12, x12, x16; \ - mul x13, x4, x6; \ - umulh x14, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x13, x13, x12; \ - adc x14, x14, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x12, x11, x13; \ - adcs x13, x13, x14; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x12, x15, x12; \ - eor x3, x3, x16; \ - adcs x13, x3, x13; \ - adc x14, x14, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x15, x16, [P1]; \ - subs x3, x3, x15; \ - sbcs x4, x4, x16; \ - csetm x16, cc; \ - ldp x15, x0, [P2]; \ - subs x5, x15, x5; \ - sbcs x6, x0, x6; \ - csetm x0, cc; \ - eor x3, x3, x16; \ - subs x3, x3, x16; \ - eor x4, x4, x16; \ - sbc x4, x4, x16; \ - eor x5, x5, x0; \ - subs x5, x5, x0; \ - eor x6, x6, x0; \ - sbc x6, x6, x0; \ - eor x16, x0, x16; \ - adds x11, x11, x9; \ - adcs x12, x12, x10; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - mul x2, x3, x5; \ - umulh x0, x3, x5; \ - mul x15, x4, x6; \ - umulh x1, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x9, cc; \ - adds x15, x15, x0; \ - adc x1, x1, xzr; \ - subs x6, x5, x6; \ - cneg x6, x6, cc; \ - cinv x9, x9, cc; \ - mul x5, x4, x6; \ - umulh x6, x4, x6; \ - adds x0, x2, x15; \ - adcs x15, x15, x1; \ - adc x1, x1, xzr; \ - cmn x9, #0x1; \ - eor x5, x5, x9; \ - adcs x0, x5, x0; \ - eor x6, x6, x9; \ - adcs x15, x6, x15; \ - adc x1, x1, x9; \ - adds x9, x11, x7; \ - adcs x10, x12, x8; \ - adcs x11, x13, x11; \ - adcs x12, x14, x12; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x2, x2, x16; \ - adcs x9, x2, x9; \ - eor x0, x0, x16; \ - adcs x10, x0, x10; \ - eor x15, x15, x16; \ - adcs x11, x15, x11; \ - eor x1, x1, x16; \ - adcs x12, x1, x12; \ - adcs x13, x13, x16; \ - adc x14, x14, x16; \ - mov x3, #0x26; \ - umull x4, w11, w3; \ - add x4, x4, w7, uxtw; \ - lsr x7, x7, #32; \ - lsr x11, x11, #32; \ - umaddl x11, w11, w3, x7; \ - mov x7, x4; \ - umull x4, w12, w3; \ - add x4, x4, w8, uxtw; \ - lsr x8, x8, #32; \ - lsr x12, x12, #32; \ - umaddl x12, w12, w3, x8; \ - mov x8, x4; \ - umull x4, w13, w3; \ - add x4, x4, w9, uxtw; \ - lsr x9, x9, #32; \ - lsr x13, x13, #32; \ - umaddl x13, w13, w3, x9; \ - mov x9, x4; \ - umull x4, w14, w3; \ - add x4, x4, w10, uxtw; \ - lsr x10, x10, #32; \ - lsr x14, x14, #32; \ - umaddl x14, w14, w3, x10; \ - mov x10, x4; \ - lsr x0, x14, #31; 
\ - mov x5, #0x13; \ - umull x5, w5, w0; \ - add x7, x7, x5; \ - adds x7, x7, x11, lsl #32; \ - extr x3, x12, x11, #32; \ - adcs x8, x8, x3; \ - extr x3, x13, x12, #32; \ - adcs x9, x9, x3; \ - extr x3, x14, x13, #32; \ - lsl x5, x0, #63; \ - eor x10, x10, x5; \ - adc x10, x10, x3; \ - stp x7, x8, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + 
eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umull x5, w5, w0 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + stp x7, x8, [P0] __LF \ stp x9, x10, [P0+16] // Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 #define sub_twice4(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - mov x4, #38; \ - csel x3, x4, xzr, lo; \ - subs x5, x5, x3; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbc x8, x8, xzr; \ - stp x5, x6, [P0]; \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ stp x7, x8, [P0+16] // Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. @@ -463,37 +463,37 @@ // at least one of them is reduced double modulo. 
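sub_twice4, converted just above, is the subtraction counterpart: a borrow out of the 256-bit subtraction is repaired by subtracting a further 38, which, working modulo 2^256, is the same as adding back 2 * p_25519. The next hunk converts add_twice4 and double_twice4, which use the mirrored conditional +38 on a carry out. A hedged Python sketch of both contracts (illustrative; both rely on the reduced-input assumptions stated in the surrounding comments):

P25519 = 2**255 - 19

def sub_twice4_ref(a, b):
    d = a - b
    if d < 0:                    # the borrow that "csel x3, x4, xzr, lo" checks
        d += 2 * P25519          # equivalently: wrap mod 2^256, then subtract 38
    return d

def double_twice4_ref(a):
    d = 2 * a
    if d >= 2**256:              # carry out of the doubling
        d = d - 2**256 + 38      # i.e. subtract 2 * P25519
    return d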
#define add_twice4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P2+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] #define double_twice4(P0,P1) \ - ldp x3, x4, [P1]; \ - adds x3, x3, x3; \ - adcs x4, x4, x4; \ - ldp x5, x6, [P1+16]; \ - adcs x5, x5, x5; \ - adcs x6, x6, x6; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + adds x3, x3, x3 __LF \ + adcs x4, x4, x4 __LF \ + ldp x5, x6, [P1+16] __LF \ + adcs x5, x5, x5 __LF \ + adcs x6, x6, x6 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] S2N_BN_SYMBOL(curve25519_x25519base): diff --git a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519base_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519base_alt.S similarity index 95% rename from third_party/s2n-bignum/arm/curve25519/curve25519_x25519base_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519base_alt.S index 702fe6e88aa..e834548f91c 100644 --- a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519base_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519base_alt.S @@ -78,224 +78,224 @@ // trivially different from a pure function call to that subroutine. 
#define mul_p25519(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - mul x12, x3, x7; \ - umulh x13, x3, x7; \ - mul x11, x3, x8; \ - umulh x14, x3, x8; \ - adds x13, x13, x11; \ - ldp x9, x10, [P2+16]; \ - mul x11, x3, x9; \ - umulh x15, x3, x9; \ - adcs x14, x14, x11; \ - mul x11, x3, x10; \ - umulh x16, x3, x10; \ - adcs x15, x15, x11; \ - adc x16, x16, xzr; \ - ldp x5, x6, [P1+16]; \ - mul x11, x4, x7; \ - adds x13, x13, x11; \ - mul x11, x4, x8; \ - adcs x14, x14, x11; \ - mul x11, x4, x9; \ - adcs x15, x15, x11; \ - mul x11, x4, x10; \ - adcs x16, x16, x11; \ - umulh x3, x4, x10; \ - adc x3, x3, xzr; \ - umulh x11, x4, x7; \ - adds x14, x14, x11; \ - umulh x11, x4, x8; \ - adcs x15, x15, x11; \ - umulh x11, x4, x9; \ - adcs x16, x16, x11; \ - adc x3, x3, xzr; \ - mul x11, x5, x7; \ - adds x14, x14, x11; \ - mul x11, x5, x8; \ - adcs x15, x15, x11; \ - mul x11, x5, x9; \ - adcs x16, x16, x11; \ - mul x11, x5, x10; \ - adcs x3, x3, x11; \ - umulh x4, x5, x10; \ - adc x4, x4, xzr; \ - umulh x11, x5, x7; \ - adds x15, x15, x11; \ - umulh x11, x5, x8; \ - adcs x16, x16, x11; \ - umulh x11, x5, x9; \ - adcs x3, x3, x11; \ - adc x4, x4, xzr; \ - mul x11, x6, x7; \ - adds x15, x15, x11; \ - mul x11, x6, x8; \ - adcs x16, x16, x11; \ - mul x11, x6, x9; \ - adcs x3, x3, x11; \ - mul x11, x6, x10; \ - adcs x4, x4, x11; \ - umulh x5, x6, x10; \ - adc x5, x5, xzr; \ - umulh x11, x6, x7; \ - adds x16, x16, x11; \ - umulh x11, x6, x8; \ - adcs x3, x3, x11; \ - umulh x11, x6, x9; \ - adcs x4, x4, x11; \ - adc x5, x5, xzr; \ - mov x7, #0x26; \ - mul x11, x7, x16; \ - umulh x9, x7, x16; \ - adds x12, x12, x11; \ - mul x11, x7, x3; \ - umulh x3, x7, x3; \ - adcs x13, x13, x11; \ - mul x11, x7, x4; \ - umulh x4, x7, x4; \ - adcs x14, x14, x11; \ - mul x11, x7, x5; \ - umulh x5, x7, x5; \ - adcs x15, x15, x11; \ - cset x16, cs; \ - adds x15, x15, x4; \ - adc x16, x16, x5; \ - cmn x15, x15; \ - orr x15, x15, #0x8000000000000000; \ - adc x8, x16, x16; \ - mov x7, #0x13; \ - madd x11, x7, x8, x7; \ - adds x12, x12, x11; \ - adcs x13, x13, x9; \ - adcs x14, x14, x3; \ - adcs x15, x15, xzr; \ - csel x7, x7, xzr, cc; \ - subs x12, x12, x7; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbc x15, x15, xzr; \ - and x15, x15, #0x7fffffffffffffff; \ - stp x12, x13, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 
__LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + orr x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + madd x11, x7, x8, x7 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x15, x15, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + and x15, x15, #0x7fffffffffffffff __LF \ + stp x12, x13, [P0] __LF \ stp x14, x15, [P0+16] // A version of multiplication that only guarantees output < 2 * p_25519. // This basically skips the +1 and final correction in quotient estimation. #define mul_4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - mul x12, x3, x7; \ - umulh x13, x3, x7; \ - mul x11, x3, x8; \ - umulh x14, x3, x8; \ - adds x13, x13, x11; \ - ldp x9, x10, [P2+16]; \ - mul x11, x3, x9; \ - umulh x15, x3, x9; \ - adcs x14, x14, x11; \ - mul x11, x3, x10; \ - umulh x16, x3, x10; \ - adcs x15, x15, x11; \ - adc x16, x16, xzr; \ - ldp x5, x6, [P1+16]; \ - mul x11, x4, x7; \ - adds x13, x13, x11; \ - mul x11, x4, x8; \ - adcs x14, x14, x11; \ - mul x11, x4, x9; \ - adcs x15, x15, x11; \ - mul x11, x4, x10; \ - adcs x16, x16, x11; \ - umulh x3, x4, x10; \ - adc x3, x3, xzr; \ - umulh x11, x4, x7; \ - adds x14, x14, x11; \ - umulh x11, x4, x8; \ - adcs x15, x15, x11; \ - umulh x11, x4, x9; \ - adcs x16, x16, x11; \ - adc x3, x3, xzr; \ - mul x11, x5, x7; \ - adds x14, x14, x11; \ - mul x11, x5, x8; \ - adcs x15, x15, x11; \ - mul x11, x5, x9; \ - adcs x16, x16, x11; \ - mul x11, x5, x10; \ - adcs x3, x3, x11; \ - umulh x4, x5, x10; \ - adc x4, x4, xzr; \ - umulh x11, x5, x7; \ - adds x15, x15, x11; \ - umulh x11, x5, x8; \ - adcs x16, x16, x11; \ - umulh x11, x5, x9; \ - adcs x3, x3, x11; \ - adc x4, x4, xzr; \ - mul x11, x6, x7; \ - adds x15, x15, x11; \ - mul x11, x6, x8; \ - adcs x16, x16, x11; \ - mul x11, x6, x9; \ - adcs x3, x3, x11; \ - mul x11, x6, x10; \ - adcs x4, x4, x11; \ - umulh x5, x6, x10; \ - adc x5, x5, xzr; \ - umulh x11, x6, x7; \ - adds x16, x16, x11; \ - umulh x11, x6, x8; \ - adcs x3, x3, x11; \ - umulh x11, x6, x9; \ - adcs x4, x4, x11; \ - adc x5, x5, xzr; \ - mov x7, #0x26; \ - mul x11, x7, x16; \ - umulh x9, x7, x16; \ - adds x12, x12, x11; \ - mul x11, x7, x3; \ - umulh x3, x7, x3; \ - adcs x13, x13, x11; \ - mul x11, x7, x4; \ - umulh x4, x7, x4; \ - adcs x14, x14, x11; \ - mul x11, x7, x5; \ - umulh x5, x7, x5; \ - adcs x15, x15, x11; \ - cset x16, cs; \ - adds x15, x15, x4; \ - adc x16, x16, x5; \ - cmn x15, x15; \ - bic x15, x15, #0x8000000000000000; \ - adc x8, x16, 
x16; \ - mov x7, #0x13; \ - mul x11, x7, x8; \ - adds x12, x12, x11; \ - adcs x13, x13, x9; \ - adcs x14, x14, x3; \ - adc x15, x15, xzr; \ - stp x12, x13, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + bic x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + mul x11, x7, x8 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adc x15, x15, xzr __LF \ + stp x12, x13, [P0] __LF \ stp x14, x15, [P0+16] // Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 #define sub_twice4(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - mov x4, #38; \ - csel x3, x4, xzr, lo; \ - subs x5, x5, x3; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbc x8, x8, xzr; \ - stp x5, x6, [P0]; \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ stp x7, x8, [P0+16] // Modular addition and doubling with double modulus 2 
* p_25519 = 2^256 - 38. @@ -305,37 +305,37 @@ // at least one of them is reduced double modulo. #define add_twice4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P2+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] #define double_twice4(P0,P1) \ - ldp x3, x4, [P1]; \ - adds x3, x3, x3; \ - adcs x4, x4, x4; \ - ldp x5, x6, [P1+16]; \ - adcs x5, x5, x5; \ - adcs x6, x6, x6; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + adds x3, x3, x3 __LF \ + adcs x4, x4, x4 __LF \ + ldp x5, x6, [P1+16] __LF \ + adcs x5, x5, x5 __LF \ + adcs x6, x6, x6 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] S2N_BN_SYMBOL(curve25519_x25519base_alt): diff --git a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519base_byte.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519base_byte.S similarity index 93% rename from third_party/s2n-bignum/arm/curve25519/curve25519_x25519base_byte.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519base_byte.S index 635729cb77a..82eb0986151 100644 --- a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519base_byte.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519base_byte.S @@ -78,382 +78,382 @@ // trivially different from a pure function call to that subroutine. 
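All of these macros fold high limbs back in with the constants 38 (#0x26) and 19 (#0x13). Those come from 2^256 mod p_25519 = 38 and 2^255 mod p_25519 = 19, with p_25519 = 2^255 - 19, so 2 * p_25519 = 2^256 - 38 as the comments state. A quick Python check of the identities (editor's note, not part of the diff):

P25519 = 2**255 - 19
assert 2**255 % P25519 == 19 and 2**256 % P25519 == 38
assert 2 * P25519 == 2**256 - 38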
#define mul_p25519(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - umull x7, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x8, w16, w0; \ - umull x16, w3, w16; \ - adds x7, x7, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x8, x8, x15; \ - adds x7, x7, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x8, x8, x16; \ - mul x9, x4, x6; \ - umulh x10, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x9, x9, x8; \ - adc x10, x10, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x8, x7, x9; \ - adcs x9, x9, x10; \ - adc x10, x10, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x8, x15, x8; \ - eor x3, x3, x16; \ - adcs x9, x3, x9; \ - adc x10, x10, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x5, x6, [P2+16]; \ - umull x11, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x12, w16, w0; \ - umull x16, w3, w16; \ - adds x11, x11, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x12, x12, x15; \ - adds x11, x11, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x12, x12, x16; \ - mul x13, x4, x6; \ - umulh x14, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x13, x13, x12; \ - adc x14, x14, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x12, x11, x13; \ - adcs x13, x13, x14; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x12, x15, x12; \ - eor x3, x3, x16; \ - adcs x13, x3, x13; \ - adc x14, x14, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x15, x16, [P1]; \ - subs x3, x3, x15; \ - sbcs x4, x4, x16; \ - csetm x16, cc; \ - ldp x15, x0, [P2]; \ - subs x5, x15, x5; \ - sbcs x6, x0, x6; \ - csetm x0, cc; \ - eor x3, x3, x16; \ - subs x3, x3, x16; \ - eor x4, x4, x16; \ - sbc x4, x4, x16; \ - eor x5, x5, x0; \ - subs x5, x5, x0; \ - eor x6, x6, x0; \ - sbc x6, x6, x0; \ - eor x16, x0, x16; \ - adds x11, x11, x9; \ - adcs x12, x12, x10; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - mul x2, x3, x5; \ - umulh x0, x3, x5; \ - mul x15, x4, x6; \ - umulh x1, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x9, cc; \ - adds x15, x15, x0; \ - adc x1, x1, xzr; \ - subs x6, x5, x6; \ - cneg x6, x6, cc; \ - cinv x9, x9, cc; \ - mul x5, x4, x6; \ - umulh x6, x4, x6; \ - adds x0, x2, x15; \ - adcs x15, x15, x1; \ - adc x1, x1, xzr; \ - cmn x9, #0x1; \ - eor x5, x5, x9; \ - adcs x0, x5, x0; \ - eor x6, x6, x9; \ - adcs x15, x6, x15; \ - adc x1, x1, x9; \ - adds x9, x11, x7; \ - adcs x10, x12, x8; \ - adcs x11, x13, x11; \ - adcs x12, x14, x12; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x2, x2, x16; \ - adcs x9, x2, x9; \ - eor x0, x0, x16; \ - adcs x10, x0, x10; \ - eor x15, x15, x16; \ - adcs x11, x15, x11; \ - eor x1, x1, x16; \ - adcs x12, x1, x12; \ - adcs x13, x13, x16; \ - adc x14, x14, x16; \ - mov x3, #0x26; \ - umull x4, w11, w3; \ - add x4, x4, w7, uxtw; \ - lsr x7, x7, #32; \ - lsr x11, x11, #32; \ - umaddl x11, w11, w3, x7; \ - mov x7, x4; \ - umull x4, w12, w3; \ - add x4, x4, w8, uxtw; \ - lsr x8, x8, #32; \ - lsr x12, x12, #32; \ - umaddl x12, w12, w3, x8; \ - mov x8, x4; \ - umull x4, w13, w3; \ - add x4, x4, w9, uxtw; \ - lsr x9, x9, #32; \ - lsr x13, x13, #32; \ - umaddl x13, w13, w3, x9; \ - mov x9, x4; \ - umull x4, w14, w3; \ - add x4, x4, w10, uxtw; \ - lsr x10, x10, #32; \ - lsr x14, x14, #32; \ - umaddl x14, w14, w3, x10; \ - mov x10, x4; \ - lsr x0, x14, 
#31; \ - mov x5, #0x13; \ - umaddl x5, w5, w0, x5; \ - add x7, x7, x5; \ - adds x7, x7, x11, lsl #32; \ - extr x3, x12, x11, #32; \ - adcs x8, x8, x3; \ - extr x3, x13, x12, #32; \ - adcs x9, x9, x3; \ - extr x3, x14, x13, #32; \ - lsl x5, x0, #63; \ - eor x10, x10, x5; \ - adc x10, x10, x3; \ - mov x3, #0x13; \ - tst x10, #0x8000000000000000; \ - csel x3, x3, xzr, pl; \ - subs x7, x7, x3; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbc x10, x10, xzr; \ - and x10, x10, #0x7fffffffffffffff; \ - stp x7, x8, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 
__LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umaddl x5, w5, w0, x5 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + mov x3, #0x13 __LF \ + tst x10, #0x8000000000000000 __LF \ + csel x3, x3, xzr, pl __LF \ + subs x7, x7, x3 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + and x10, x10, #0x7fffffffffffffff __LF \ + stp x7, x8, [P0] __LF \ stp x9, x10, [P0+16] // A version of multiplication that only guarantees output < 2 * p_25519. // This basically skips the +1 and final correction in quotient estimation. 
#define mul_4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - umull x7, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x8, w16, w0; \ - umull x16, w3, w16; \ - adds x7, x7, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x8, x8, x15; \ - adds x7, x7, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x8, x8, x16; \ - mul x9, x4, x6; \ - umulh x10, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x9, x9, x8; \ - adc x10, x10, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x8, x7, x9; \ - adcs x9, x9, x10; \ - adc x10, x10, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x8, x15, x8; \ - eor x3, x3, x16; \ - adcs x9, x3, x9; \ - adc x10, x10, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x5, x6, [P2+16]; \ - umull x11, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x12, w16, w0; \ - umull x16, w3, w16; \ - adds x11, x11, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x12, x12, x15; \ - adds x11, x11, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x12, x12, x16; \ - mul x13, x4, x6; \ - umulh x14, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x13, x13, x12; \ - adc x14, x14, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x12, x11, x13; \ - adcs x13, x13, x14; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x12, x15, x12; \ - eor x3, x3, x16; \ - adcs x13, x3, x13; \ - adc x14, x14, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x15, x16, [P1]; \ - subs x3, x3, x15; \ - sbcs x4, x4, x16; \ - csetm x16, cc; \ - ldp x15, x0, [P2]; \ - subs x5, x15, x5; \ - sbcs x6, x0, x6; \ - csetm x0, cc; \ - eor x3, x3, x16; \ - subs x3, x3, x16; \ - eor x4, x4, x16; \ - sbc x4, x4, x16; \ - eor x5, x5, x0; \ - subs x5, x5, x0; \ - eor x6, x6, x0; \ - sbc x6, x6, x0; \ - eor x16, x0, x16; \ - adds x11, x11, x9; \ - adcs x12, x12, x10; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - mul x2, x3, x5; \ - umulh x0, x3, x5; \ - mul x15, x4, x6; \ - umulh x1, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x9, cc; \ - adds x15, x15, x0; \ - adc x1, x1, xzr; \ - subs x6, x5, x6; \ - cneg x6, x6, cc; \ - cinv x9, x9, cc; \ - mul x5, x4, x6; \ - umulh x6, x4, x6; \ - adds x0, x2, x15; \ - adcs x15, x15, x1; \ - adc x1, x1, xzr; \ - cmn x9, #0x1; \ - eor x5, x5, x9; \ - adcs x0, x5, x0; \ - eor x6, x6, x9; \ - adcs x15, x6, x15; \ - adc x1, x1, x9; \ - adds x9, x11, x7; \ - adcs x10, x12, x8; \ - adcs x11, x13, x11; \ - adcs x12, x14, x12; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x2, x2, x16; \ - adcs x9, x2, x9; \ - eor x0, x0, x16; \ - adcs x10, x0, x10; \ - eor x15, x15, x16; \ - adcs x11, x15, x11; \ - eor x1, x1, x16; \ - adcs x12, x1, x12; \ - adcs x13, x13, x16; \ - adc x14, x14, x16; \ - mov x3, #0x26; \ - umull x4, w11, w3; \ - add x4, x4, w7, uxtw; \ - lsr x7, x7, #32; \ - lsr x11, x11, #32; \ - umaddl x11, w11, w3, x7; \ - mov x7, x4; \ - umull x4, w12, w3; \ - add x4, x4, w8, uxtw; \ - lsr x8, x8, #32; \ - lsr x12, x12, #32; \ - umaddl x12, w12, w3, x8; \ - mov x8, x4; \ - umull x4, w13, w3; \ - add x4, x4, w9, uxtw; \ - lsr x9, x9, #32; \ - lsr x13, x13, #32; \ - umaddl x13, w13, w3, x9; \ - mov x9, x4; \ - umull x4, w14, w3; \ - add x4, x4, w10, uxtw; \ - lsr x10, x10, #32; \ - lsr x14, x14, #32; \ - umaddl x14, w14, w3, x10; \ - mov x10, x4; \ - lsr x0, x14, #31; 
\ - mov x5, #0x13; \ - umull x5, w5, w0; \ - add x7, x7, x5; \ - adds x7, x7, x11, lsl #32; \ - extr x3, x12, x11, #32; \ - adcs x8, x8, x3; \ - extr x3, x13, x12, #32; \ - adcs x9, x9, x3; \ - extr x3, x14, x13, #32; \ - lsl x5, x0, #63; \ - eor x10, x10, x5; \ - adc x10, x10, x3; \ - stp x7, x8, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + 
eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umull x5, w5, w0 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + stp x7, x8, [P0] __LF \ stp x9, x10, [P0+16] // Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 #define sub_twice4(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - mov x4, #38; \ - csel x3, x4, xzr, lo; \ - subs x5, x5, x3; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbc x8, x8, xzr; \ - stp x5, x6, [P0]; \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ stp x7, x8, [P0+16] // Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. @@ -463,37 +463,37 @@ // at least one of them is reduced double modulo. 
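
For readers following the macro changes above: sub_twice4 and add_twice4 work with residues modulo the double modulus 2 * p_25519 = 2^256 - 38, so a borrow out of the 256-bit subtraction is repaired by taking away a further 38, and a carry out of the addition by adding 38 back. A minimal Python reference model of that behaviour (the function names are mine, not part of s2n-bignum; plain integers stand in for the 4x64-bit limb vectors):

    import random

    P2 = 2**256 - 38            # double modulus 2 * p_25519
    M = 2**256                  # range of four 64-bit limbs

    def sub_twice4(x, y):
        # 256-bit subtract; on borrow the wrapped value is x - y + 2^256,
        # so removing 38 more leaves x - y + 2*p_25519.
        d = (x - y) % M
        return d - 38 if x < y else d

    def add_twice4(x, y):
        # 256-bit add; a dropped carry of 2^256 equals 2*p_25519 + 38,
        # so adding 38 back keeps the result congruent modulo 2*p_25519.
        s = x + y
        return s - M + 38 if s >= M else s

    for _ in range(1000):
        x, y = random.randrange(P2), random.randrange(P2)
        assert sub_twice4(x, y) % P2 == (x - y) % P2
        assert add_twice4(x, y) % P2 == (x + y) % P2

As the comment above notes, the addition result is only guaranteed to fit in four limbs, not to be below 2 * p_25519 again; the congruence is what matters.
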
#define add_twice4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P2+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] #define double_twice4(P0,P1) \ - ldp x3, x4, [P1]; \ - adds x3, x3, x3; \ - adcs x4, x4, x4; \ - ldp x5, x6, [P1+16]; \ - adcs x5, x5, x5; \ - adcs x6, x6, x6; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + adds x3, x3, x3 __LF \ + adcs x4, x4, x4 __LF \ + ldp x5, x6, [P1+16] __LF \ + adcs x5, x5, x5 __LF \ + adcs x6, x6, x6 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] S2N_BN_SYMBOL(curve25519_x25519base_byte): diff --git a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519base_byte_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519base_byte_alt.S similarity index 95% rename from third_party/s2n-bignum/arm/curve25519/curve25519_x25519base_byte_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519base_byte_alt.S index 39b6bfd1724..b3062b6837d 100644 --- a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519base_byte_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519base_byte_alt.S @@ -78,224 +78,224 @@ // trivially different from a pure function call to that subroutine. 
#define mul_p25519(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - mul x12, x3, x7; \ - umulh x13, x3, x7; \ - mul x11, x3, x8; \ - umulh x14, x3, x8; \ - adds x13, x13, x11; \ - ldp x9, x10, [P2+16]; \ - mul x11, x3, x9; \ - umulh x15, x3, x9; \ - adcs x14, x14, x11; \ - mul x11, x3, x10; \ - umulh x16, x3, x10; \ - adcs x15, x15, x11; \ - adc x16, x16, xzr; \ - ldp x5, x6, [P1+16]; \ - mul x11, x4, x7; \ - adds x13, x13, x11; \ - mul x11, x4, x8; \ - adcs x14, x14, x11; \ - mul x11, x4, x9; \ - adcs x15, x15, x11; \ - mul x11, x4, x10; \ - adcs x16, x16, x11; \ - umulh x3, x4, x10; \ - adc x3, x3, xzr; \ - umulh x11, x4, x7; \ - adds x14, x14, x11; \ - umulh x11, x4, x8; \ - adcs x15, x15, x11; \ - umulh x11, x4, x9; \ - adcs x16, x16, x11; \ - adc x3, x3, xzr; \ - mul x11, x5, x7; \ - adds x14, x14, x11; \ - mul x11, x5, x8; \ - adcs x15, x15, x11; \ - mul x11, x5, x9; \ - adcs x16, x16, x11; \ - mul x11, x5, x10; \ - adcs x3, x3, x11; \ - umulh x4, x5, x10; \ - adc x4, x4, xzr; \ - umulh x11, x5, x7; \ - adds x15, x15, x11; \ - umulh x11, x5, x8; \ - adcs x16, x16, x11; \ - umulh x11, x5, x9; \ - adcs x3, x3, x11; \ - adc x4, x4, xzr; \ - mul x11, x6, x7; \ - adds x15, x15, x11; \ - mul x11, x6, x8; \ - adcs x16, x16, x11; \ - mul x11, x6, x9; \ - adcs x3, x3, x11; \ - mul x11, x6, x10; \ - adcs x4, x4, x11; \ - umulh x5, x6, x10; \ - adc x5, x5, xzr; \ - umulh x11, x6, x7; \ - adds x16, x16, x11; \ - umulh x11, x6, x8; \ - adcs x3, x3, x11; \ - umulh x11, x6, x9; \ - adcs x4, x4, x11; \ - adc x5, x5, xzr; \ - mov x7, #0x26; \ - mul x11, x7, x16; \ - umulh x9, x7, x16; \ - adds x12, x12, x11; \ - mul x11, x7, x3; \ - umulh x3, x7, x3; \ - adcs x13, x13, x11; \ - mul x11, x7, x4; \ - umulh x4, x7, x4; \ - adcs x14, x14, x11; \ - mul x11, x7, x5; \ - umulh x5, x7, x5; \ - adcs x15, x15, x11; \ - cset x16, cs; \ - adds x15, x15, x4; \ - adc x16, x16, x5; \ - cmn x15, x15; \ - orr x15, x15, #0x8000000000000000; \ - adc x8, x16, x16; \ - mov x7, #0x13; \ - madd x11, x7, x8, x7; \ - adds x12, x12, x11; \ - adcs x13, x13, x9; \ - adcs x14, x14, x3; \ - adcs x15, x15, xzr; \ - csel x7, x7, xzr, cc; \ - subs x12, x12, x7; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbc x15, x15, xzr; \ - and x15, x15, #0x7fffffffffffffff; \ - stp x12, x13, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 
__LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + orr x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + madd x11, x7, x8, x7 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x15, x15, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + and x15, x15, #0x7fffffffffffffff __LF \ + stp x12, x13, [P0] __LF \ stp x14, x15, [P0+16] // A version of multiplication that only guarantees output < 2 * p_25519. // This basically skips the +1 and final correction in quotient estimation. #define mul_4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - mul x12, x3, x7; \ - umulh x13, x3, x7; \ - mul x11, x3, x8; \ - umulh x14, x3, x8; \ - adds x13, x13, x11; \ - ldp x9, x10, [P2+16]; \ - mul x11, x3, x9; \ - umulh x15, x3, x9; \ - adcs x14, x14, x11; \ - mul x11, x3, x10; \ - umulh x16, x3, x10; \ - adcs x15, x15, x11; \ - adc x16, x16, xzr; \ - ldp x5, x6, [P1+16]; \ - mul x11, x4, x7; \ - adds x13, x13, x11; \ - mul x11, x4, x8; \ - adcs x14, x14, x11; \ - mul x11, x4, x9; \ - adcs x15, x15, x11; \ - mul x11, x4, x10; \ - adcs x16, x16, x11; \ - umulh x3, x4, x10; \ - adc x3, x3, xzr; \ - umulh x11, x4, x7; \ - adds x14, x14, x11; \ - umulh x11, x4, x8; \ - adcs x15, x15, x11; \ - umulh x11, x4, x9; \ - adcs x16, x16, x11; \ - adc x3, x3, xzr; \ - mul x11, x5, x7; \ - adds x14, x14, x11; \ - mul x11, x5, x8; \ - adcs x15, x15, x11; \ - mul x11, x5, x9; \ - adcs x16, x16, x11; \ - mul x11, x5, x10; \ - adcs x3, x3, x11; \ - umulh x4, x5, x10; \ - adc x4, x4, xzr; \ - umulh x11, x5, x7; \ - adds x15, x15, x11; \ - umulh x11, x5, x8; \ - adcs x16, x16, x11; \ - umulh x11, x5, x9; \ - adcs x3, x3, x11; \ - adc x4, x4, xzr; \ - mul x11, x6, x7; \ - adds x15, x15, x11; \ - mul x11, x6, x8; \ - adcs x16, x16, x11; \ - mul x11, x6, x9; \ - adcs x3, x3, x11; \ - mul x11, x6, x10; \ - adcs x4, x4, x11; \ - umulh x5, x6, x10; \ - adc x5, x5, xzr; \ - umulh x11, x6, x7; \ - adds x16, x16, x11; \ - umulh x11, x6, x8; \ - adcs x3, x3, x11; \ - umulh x11, x6, x9; \ - adcs x4, x4, x11; \ - adc x5, x5, xzr; \ - mov x7, #0x26; \ - mul x11, x7, x16; \ - umulh x9, x7, x16; \ - adds x12, x12, x11; \ - mul x11, x7, x3; \ - umulh x3, x7, x3; \ - adcs x13, x13, x11; \ - mul x11, x7, x4; \ - umulh x4, x7, x4; \ - adcs x14, x14, x11; \ - mul x11, x7, x5; \ - umulh x5, x7, x5; \ - adcs x15, x15, x11; \ - cset x16, cs; \ - adds x15, x15, x4; \ - adc x16, x16, x5; \ - cmn x15, x15; \ - bic x15, x15, #0x8000000000000000; \ - adc x8, x16, 
x16; \ - mov x7, #0x13; \ - mul x11, x7, x8; \ - adds x12, x12, x11; \ - adcs x13, x13, x9; \ - adcs x14, x14, x3; \ - adc x15, x15, xzr; \ - stp x12, x13, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + bic x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + mul x11, x7, x8 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adc x15, x15, xzr __LF \ + stp x12, x13, [P0] __LF \ stp x14, x15, [P0+16] // Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 #define sub_twice4(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - mov x4, #38; \ - csel x3, x4, xzr, lo; \ - subs x5, x5, x3; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbc x8, x8, xzr; \ - stp x5, x6, [P0]; \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ stp x7, x8, [P0+16] // Modular addition and doubling with double modulus 2 
* p_25519 = 2^256 - 38. @@ -305,37 +305,37 @@ // at least one of them is reduced double modulo. #define add_twice4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P2+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] #define double_twice4(P0,P1) \ - ldp x3, x4, [P1]; \ - adds x3, x3, x3; \ - adcs x4, x4, x4; \ - ldp x5, x6, [P1+16]; \ - adcs x5, x5, x5; \ - adcs x6, x6, x6; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + adds x3, x3, x3 __LF \ + adcs x4, x4, x4 __LF \ + ldp x5, x6, [P1+16] __LF \ + adcs x5, x5, x5 __LF \ + adcs x6, x6, x6 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] S2N_BN_SYMBOL(curve25519_x25519base_byte_alt): diff --git a/third_party/s2n-bignum/arm/curve25519/edwards25519_decode.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_decode.S similarity index 96% rename from third_party/s2n-bignum/arm/curve25519/edwards25519_decode.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_decode.S index f565df90fd1..715662b1c9e 100644 --- a/third_party/s2n-bignum/arm/curve25519/edwards25519_decode.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_decode.S @@ -59,23 +59,23 @@ // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 // Macros wrapping up calls to the local subroutines #define mulp(dest,src1,src2) \ - add x0, dest; \ - add x1, src1; \ - add x2, src2; \ + add x0, dest __LF \ + add x1, src1 __LF \ + add x2, src2 __LF \ bl edwards25519_decode_mul_p25519 #define nsqr(dest,n,src) \ - add x0, dest; \ - mov x1, n; \ - add x2, src; \ + add x0, dest __LF \ + mov x1, n __LF \ + add x2, src __LF \ bl edwards25519_decode_nsqr_p25519 S2N_BN_SYMBOL(edwards25519_decode): diff --git a/third_party/s2n-bignum/arm/curve25519/edwards25519_decode_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_decode_alt.S similarity index 95% rename from third_party/s2n-bignum/arm/curve25519/edwards25519_decode_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_decode_alt.S index befacd2ff01..79743f73b03 100644 --- a/third_party/s2n-bignum/arm/curve25519/edwards25519_decode_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_decode_alt.S @@ -59,23 +59,23 @@ // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 
__LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 // Macros wrapping up calls to the local subroutines #define mulp(dest,src1,src2) \ - add x0, dest; \ - add x1, src1; \ - add x2, src2; \ + add x0, dest __LF \ + add x1, src1 __LF \ + add x2, src2 __LF \ bl edwards25519_decode_alt_mul_p25519 #define nsqr(dest,n,src) \ - add x0, dest; \ - mov x1, n; \ - add x2, src; \ + add x0, dest __LF \ + mov x1, n __LF \ + add x2, src __LF \ bl edwards25519_decode_alt_nsqr_p25519 S2N_BN_SYMBOL(edwards25519_decode_alt): diff --git a/third_party/s2n-bignum/arm/curve25519/edwards25519_encode.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_encode.S similarity index 100% rename from third_party/s2n-bignum/arm/curve25519/edwards25519_encode.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_encode.S diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_epadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_epadd.S new file mode 100644 index 00000000000..ee94ffc3370 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_epadd.S @@ -0,0 +1,588 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended projective addition for edwards25519 +// Inputs p1[16], p2[16]; output p3[16] +// +// extern void edwards25519_epadd +// (uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 16]) +// +// The output p3 and both inputs p1 and p2 are points (x,y) on +// edwards25519 represented in extended projective quadruples (X,Y,Z,T) +// where x = X / Z, y = Y / Z and x * y = T / Z. +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_epadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_epadd) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define p3 x17 +#define p1 x19 +#define p2 x20 + +// Pointers to input and output coordinates + +#define x_1 p1, #0 +#define y_1 p1, #NUMSIZE +#define z_1 p1, #(2*NUMSIZE) +#define w_1 p1, #(3*NUMSIZE) + +#define x_2 p2, #0 +#define y_2 p2, #NUMSIZE +#define z_2 p2, #(2*NUMSIZE) +#define w_2 p2, #(3*NUMSIZE) + +#define x_3 p3, #0 +#define y_3 p3, #NUMSIZE +#define z_3 p3, #(2*NUMSIZE) +#define w_3 p3, #(3*NUMSIZE) + +// Pointer-offset pairs for temporaries on stack + +#define t0 sp, #(0*NUMSIZE) +#define t1 sp, #(1*NUMSIZE) +#define t2 sp, #(2*NUMSIZE) +#define t3 sp, #(3*NUMSIZE) +#define t4 sp, #(4*NUMSIZE) +#define t5 sp, #(5*NUMSIZE) + +// Total size to reserve on the stack + +#define NSPACE (6*NUMSIZE) + +// Macro wrapping up the basic field operation bignum_mul_p25519, only +// trivially different from a pure function call to that subroutine. 
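
The mul_p25519 macro that follows folds the 512-bit product back down using the fact that 2^256 = 2 * p_25519 + 38, i.e. 2^256 is congruent to 38 modulo p_25519, before a final conditional subtraction lands in the canonical range. A quick Python check of that identity and of the fold step it licenses (the helper name is mine, for illustration only):

    import random

    P = 2**255 - 19
    assert 2**256 % P == 38           # 2^256 = 2*p_25519 + 38

    def fold38(t):
        # Replace the bits at and above 2^256 by 38 times their value.
        return (t & (2**256 - 1)) + 38 * (t >> 256)

    for _ in range(1000):
        a, b = random.randrange(P), random.randrange(P)
        t = fold38(fold38(a * b))     # roughly 256 bits, still congruent to a*b
        assert t % P == (a * b) % P
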
+ +#define mul_p25519(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, 
x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umaddl x5, w5, w0, x5 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + mov x3, #0x13 __LF \ + tst x10, #0x8000000000000000 __LF \ + csel x3, x3, xzr, pl __LF \ + subs x7, x7, x3 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + and x10, x10, #0x7fffffffffffffff __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. + +#define mul_4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor 
x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umull x5, w5, w0 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] + +// Plain 4-digit add and doubling without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + ldp x0, x1, [P1] __LF \ + ldp x4, x5, [P2] __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + ldp x2, x3, [P1+16] __LF \ + ldp x6, x7, [P2+16] __LF \ + adcs x2, x2, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +#define double_4(P0,P1) \ + ldp x0, x1, [P1] __LF \ + adds x0, x0, x0 __LF \ + adcs x1, x1, x1 __LF \ + ldp x2, x3, [P1+16] __LF \ + adcs x2, x2, x2 __LF \ + adc x3, x3, x3 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. 
It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x3, #19 __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + mov x4, #0x8000000000000000 __LF \ + sbc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Modular addition with inputs double modulus 2 * p_25519 = 2^256 - 38 +// and in general only guaranteeing a 4-digit result, not even < 2 * p_25519. + +#define add_twice4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +// Load the constant k_25519 = 2 * d_25519 using immediate operations + +#define load_k25519(P0) \ + movz x0, #0xf159 __LF \ + movz x1, #0xb156 __LF \ + movz x2, #0xd130 __LF \ + movz x3, #0xfce7 __LF \ + movk x0, #0x26b2, lsl #16 __LF \ + movk x1, #0x8283, lsl #16 __LF \ + movk x2, #0xeef3, lsl #16 __LF \ + movk x3, #0x56df, lsl #16 __LF \ + movk x0, #0x9b94, lsl #32 __LF \ + movk x1, #0x149a, lsl #32 __LF \ + movk x2, #0x80f2, lsl #32 __LF \ + movk x3, #0xd9dc, lsl #32 __LF \ + movk x0, #0xebd6, lsl #48 __LF \ + movk x1, #0x00e0, lsl #48 __LF \ + movk x2, #0x198e, lsl #48 __LF \ + movk x3, #0x2406, lsl #48 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +S2N_BN_SYMBOL(edwards25519_epadd): + +// Save regs and make room for temporaries + + stp x19, x20, [sp, -16]! 
+ sub sp, sp, #NSPACE + +// Move the input arguments to stable places + + mov p3, x0 + mov p1, x1 + mov p2, x2 + +// Main sequence + + mul_4(t0,w_1,w_2) + + sub_4(t1,y_1,x_1) + sub_4(t2,y_2,x_2) + add_4(t3,y_1,x_1) + add_4(t4,y_2,x_2) + double_4(t5,z_2) + + mul_4(t1,t1,t2) + mul_4(t3,t3,t4) + + load_k25519(t2) + mul_4(t2,t2,t0) + + mul_4(t4,z_1,t5) + + sub_twice4(t0,t3,t1) + add_twice4(t5,t3,t1) + sub_twice4(t1,t4,t2) + add_twice4(t3,t4,t2) + + mul_p25519(w_3,t0,t5) + mul_p25519(x_3,t0,t1) + mul_p25519(y_3,t3,t5) + mul_p25519(z_3,t1,t3) + +// Restore stack and registers + + add sp, sp, #NSPACE + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_epadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_epadd_alt.S new file mode 100644 index 00000000000..4324c25245f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_epadd_alt.S @@ -0,0 +1,431 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended projective addition for edwards25519 +// Inputs p1[16], p2[16]; output p3[16] +// +// extern void edwards25519_epadd_alt +// (uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 16]) +// +// The output p3 and both inputs p1 and p2 are points (x,y) on +// edwards25519 represented in extended projective quadruples (X,Y,Z,T) +// where x = X / Z, y = Y / Z and x * y = T / Z. +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_epadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_epadd_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define p3 x17 +#define p1 x19 +#define p2 x20 + +// Pointers to input and output coordinates + +#define x_1 p1, #0 +#define y_1 p1, #NUMSIZE +#define z_1 p1, #(2*NUMSIZE) +#define w_1 p1, #(3*NUMSIZE) + +#define x_2 p2, #0 +#define y_2 p2, #NUMSIZE +#define z_2 p2, #(2*NUMSIZE) +#define w_2 p2, #(3*NUMSIZE) + +#define x_3 p3, #0 +#define y_3 p3, #NUMSIZE +#define z_3 p3, #(2*NUMSIZE) +#define w_3 p3, #(3*NUMSIZE) + +// Pointer-offset pairs for temporaries on stack + +#define t0 sp, #(0*NUMSIZE) +#define t1 sp, #(1*NUMSIZE) +#define t2 sp, #(2*NUMSIZE) +#define t3 sp, #(3*NUMSIZE) +#define t4 sp, #(4*NUMSIZE) +#define t5 sp, #(5*NUMSIZE) + +// Total size to reserve on the stack + +#define NSPACE (6*NUMSIZE) + +// Macro wrapping up the basic field operation bignum_mul_p25519_alt, only +// trivially different from a pure function call to that subroutine. 
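
The main sequence of edwards25519_epadd above (this _alt file repeats it with the alternative multiply macros) is the standard unified extended-coordinate addition with k = 2*d. A compact Python model of what that sequence computes, convenient for cross-checking test vectors; the function and variable names are my own, not part of the library:

    P = 2**255 - 19
    D = (-121665 * pow(121666, -1, P)) % P       # Edwards curve constant d
    K = 2 * D % P                                # k_25519, as loaded by load_k25519
    assert K == 0x2406d9dc56dffce7198e80f2eef3d13000e0149a8283b156ebd69b9426b2f159

    def epadd_model(p1, p2):
        # p1, p2, result: (X, Y, Z, T) with x = X/Z, y = Y/Z, x*y = T/Z.
        X1, Y1, Z1, T1 = p1
        X2, Y2, Z2, T2 = p2
        a = (Y1 - X1) * (Y2 - X2) % P            # t1
        b = (Y1 + X1) * (Y2 + X2) % P            # t3
        c = K * T1 * T2 % P                      # t2
        d = 2 * Z1 * Z2 % P                      # t4
        e, h = (b - a) % P, (b + a) % P          # t0, t5
        f, g = (d - c) % P, (d + c) % P          # t1, t3
        return (e * f % P, g * h % P, f * g % P, e * h % P)   # X3, Y3, Z3, T3

Converting affine inputs (x, y) to (x, y, 1, x*y mod P) and comparing X3/Z3 and Y3/Z3 against the textbook Edwards addition law is a quick way to exercise the new assembly against this model.
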
+ +#define mul_p25519(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + orr x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + madd x11, x7, x8, x7 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x15, x15, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + and x15, x15, #0x7fffffffffffffff __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x15, [P0+16] + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. 
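
To make the distinction in the comment above concrete: mul_p25519 finishes its quotient estimate with the "+1" and an optional correction so the output lands in [0, p_25519), while mul_4 below stops once the value is known to be an acceptable representative below 2 * p_25519. A rough Python sketch of the two behaviours (my own helper names; the assembly's limb-level carry handling is not reproduced):

    import random

    P = 2**255 - 19

    def mul_4_sketch(a, b):
        # Congruent to a*b mod p, only guaranteed to stay below 2*p.
        t = a * b
        t = (t & (2**256 - 1)) + 38 * (t >> 256)    # 2^256 == 38 (mod p)
        t = (t & (2**256 - 1)) + 38 * (t >> 256)
        return (t & (2**255 - 1)) + 19 * (t >> 255) # 2^255 == 19 (mod p)

    def mul_p25519_sketch(a, b):
        # Same folding, plus the final conditional subtraction into [0, p).
        t = mul_4_sketch(a, b)
        return t - P if t >= P else t

    for _ in range(1000):
        a, b = random.randrange(P), random.randrange(P)
        assert mul_4_sketch(a, b) % P == a * b % P
        assert mul_p25519_sketch(a, b) == a * b % P
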
+ +#define mul_4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + bic x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + mul x11, x7, x8 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adc x15, x15, xzr __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x15, [P0+16] + +// Plain 4-digit add and doubling without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + ldp x0, x1, [P1] __LF \ + ldp x4, x5, [P2] __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + ldp x2, x3, [P1+16] __LF \ + ldp x6, x7, [P2+16] __LF \ + adcs x2, x2, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +#define double_4(P0,P1) \ + ldp x0, x1, [P1] __LF \ + adds x0, x0, x0 __LF \ + adcs x1, x1, x1 __LF \ + ldp x2, x3, [P1+16] __LF \ + adcs x2, x2, x2 __LF \ + adc x3, x3, x3 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. 
It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x3, #19 __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + mov x4, #0x8000000000000000 __LF \ + sbc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Modular addition with inputs double modulus 2 * p_25519 = 2^256 - 38 +// and in general only guaranteeing a 4-digit result, not even < 2 * p_25519. + +#define add_twice4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +// Load the constant k_25519 = 2 * d_25519 using immediate operations + +#define load_k25519(P0) \ + movz x0, #0xf159 __LF \ + movz x1, #0xb156 __LF \ + movz x2, #0xd130 __LF \ + movz x3, #0xfce7 __LF \ + movk x0, #0x26b2, lsl #16 __LF \ + movk x1, #0x8283, lsl #16 __LF \ + movk x2, #0xeef3, lsl #16 __LF \ + movk x3, #0x56df, lsl #16 __LF \ + movk x0, #0x9b94, lsl #32 __LF \ + movk x1, #0x149a, lsl #32 __LF \ + movk x2, #0x80f2, lsl #32 __LF \ + movk x3, #0xd9dc, lsl #32 __LF \ + movk x0, #0xebd6, lsl #48 __LF \ + movk x1, #0x00e0, lsl #48 __LF \ + movk x2, #0x198e, lsl #48 __LF \ + movk x3, #0x2406, lsl #48 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +S2N_BN_SYMBOL(edwards25519_epadd_alt): + +// Save regs and make room for temporaries + + stp x19, x20, [sp, -16]! 
+ sub sp, sp, #NSPACE + +// Move the input arguments to stable places + + mov p3, x0 + mov p1, x1 + mov p2, x2 + +// Main sequence + + mul_4(t0,w_1,w_2) + + sub_4(t1,y_1,x_1) + sub_4(t2,y_2,x_2) + add_4(t3,y_1,x_1) + add_4(t4,y_2,x_2) + double_4(t5,z_2) + + mul_4(t1,t1,t2) + mul_4(t3,t3,t4) + + load_k25519(t2) + mul_4(t2,t2,t0) + + mul_4(t4,z_1,t5) + + sub_twice4(t0,t3,t1) + add_twice4(t5,t3,t1) + sub_twice4(t1,t4,t2) + add_twice4(t3,t4,t2) + + mul_p25519(w_3,t0,t5) + mul_p25519(x_3,t0,t1) + mul_p25519(y_3,t3,t5) + mul_p25519(z_3,t1,t3) + +// Restore stack and registers + + add sp, sp, #NSPACE + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_epdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_epdouble.S new file mode 100644 index 00000000000..13ac61219b9 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_epdouble.S @@ -0,0 +1,494 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended projective doubling for edwards25519 +// Input p1[12]; output p3[16] +// +// extern void edwards25519_epdouble +// (uint64_t p3[static 16],uint64_t p1[static 12]) +// +// If p1 is a point on edwards25519, returns its double p3 = 2 * p1. +// The output p3 is in extended projective coordinates, representing +// affine (x,y) by a quadruple (X,Y,Z,T) where x = X / Z, y = Y / Z +// and x * y = T / Z. The input p1 may also be in the same extended +// projective representation, but the final T field is not used so +// a more basic projective triple (X,Y,Z) suffices. +// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_epdouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_epdouble) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define p3 x17 +#define p1 x19 + +// Pointers to input and output coordinates + +#define x_1 p1, #0 +#define y_1 p1, #NUMSIZE +#define z_1 p1, #(2*NUMSIZE) + +#define x_3 p3, #0 +#define y_3 p3, #NUMSIZE +#define z_3 p3, #(2*NUMSIZE) +#define w_3 p3, #(3*NUMSIZE) + +// Pointer-offset pairs for temporaries on stack + +#define t0 sp, #(0*NUMSIZE) +#define t1 sp, #(1*NUMSIZE) +#define t2 sp, #(2*NUMSIZE) +#define t3 sp, #(3*NUMSIZE) +#define t4 sp, #(4*NUMSIZE) + +// Total size to reserve on the stack + +#define NSPACE (5*NUMSIZE) + +// Macro wrapping up the basic field operation bignum_mul_p25519, only +// trivially different from a pure function call to that subroutine. 
+ +#define mul_p25519(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, 
x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umaddl x5, w5, w0, x5 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + mov x3, #0x13 __LF \ + tst x10, #0x8000000000000000 __LF \ + csel x3, x3, xzr, pl __LF \ + subs x7, x7, x3 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + and x10, x10, #0x7fffffffffffffff __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. + +#define sqr_4(P0,P1) \ + ldp x10, x11, [P1] __LF \ + ldp x12, x13, [P1+16] __LF \ + umull x2, w10, w10 __LF \ + lsr x14, x10, #32 __LF \ + umull x3, w14, w14 __LF \ + umull x14, w10, w14 __LF \ + adds x2, x2, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x3, x3, x14 __LF \ + umull x4, w11, w11 __LF \ + lsr x14, x11, #32 __LF \ + umull x5, w14, w14 __LF \ + umull x14, w11, w14 __LF \ + mul x15, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x4, x4, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x5, x5, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x5, x5, xzr __LF \ + adds x3, x3, x15 __LF \ + adcs x4, x4, x16 __LF \ + adc x5, x5, xzr __LF \ + umull x6, w12, w12 __LF \ + lsr x14, x12, #32 __LF \ + umull x7, w14, w14 __LF \ + umull x14, w12, w14 __LF \ + adds x6, x6, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x7, x7, x14 __LF \ + umull x8, w13, w13 __LF \ + lsr x14, x13, #32 __LF \ + umull x9, w14, w14 __LF \ + umull x14, w13, w14 __LF \ + mul x15, x12, x13 __LF \ + umulh x16, x12, x13 __LF \ + adds x8, x8, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x9, x9, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x9, x9, xzr __LF \ + adds x7, x7, x15 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, xzr __LF \ + subs x10, x10, x12 __LF \ + sbcs x11, x11, x13 __LF \ + csetm x16, cc __LF \ + eor x10, x10, x16 __LF \ + subs x10, x10, x16 __LF \ + eor x11, x11, x16 __LF \ + sbc x11, x11, x16 __LF \ + adds x6, x6, x4 __LF \ + adcs x7, x7, x5 __LF \ + adcs x8, x8, xzr __LF \ + adc x9, x9, xzr __LF \ + umull x12, w10, w10 __LF \ + lsr x5, x10, #32 __LF \ + umull x13, w5, w5 __LF \ + umull x5, w10, w5 __LF \ + adds x12, x12, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x13, x13, x5 __LF \ + umull x15, w11, w11 __LF \ + lsr x5, x11, #32 __LF \ + umull x14, w5, w5 __LF \ + umull x5, w11, w5 __LF \ + mul x4, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x15, x15, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x14, x14, x5 __LF \ + adds x4, x4, x4 __LF \ + adcs x16, x16, x16 __LF \ + adc x14, x14, xzr __LF \ + 
adds x13, x13, x4 __LF \ + adcs x15, x15, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x4, x2, x6 __LF \ + adcs x5, x3, x7 __LF \ + adcs x6, x6, x8 __LF \ + adcs x7, x7, x9 __LF \ + csetm x16, cc __LF \ + subs x4, x4, x12 __LF \ + sbcs x5, x5, x13 __LF \ + sbcs x6, x6, x15 __LF \ + sbcs x7, x7, x14 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, x16 __LF \ + mov x10, #0x26 __LF \ + umull x12, w6, w10 __LF \ + add x12, x12, w2, uxtw __LF \ + lsr x2, x2, #32 __LF \ + lsr x6, x6, #32 __LF \ + umaddl x6, w6, w10, x2 __LF \ + mov x2, x12 __LF \ + umull x12, w7, w10 __LF \ + add x12, x12, w3, uxtw __LF \ + lsr x3, x3, #32 __LF \ + lsr x7, x7, #32 __LF \ + umaddl x7, w7, w10, x3 __LF \ + mov x3, x12 __LF \ + umull x12, w8, w10 __LF \ + add x12, x12, w4, uxtw __LF \ + lsr x4, x4, #32 __LF \ + lsr x8, x8, #32 __LF \ + umaddl x8, w8, w10, x4 __LF \ + mov x4, x12 __LF \ + umull x12, w9, w10 __LF \ + add x12, x12, w5, uxtw __LF \ + lsr x5, x5, #32 __LF \ + lsr x9, x9, #32 __LF \ + umaddl x9, w9, w10, x5 __LF \ + mov x5, x12 __LF \ + lsr x13, x9, #31 __LF \ + mov x11, #0x13 __LF \ + umull x11, w11, w13 __LF \ + add x2, x2, x11 __LF \ + adds x2, x2, x6, lsl #32 __LF \ + extr x10, x7, x6, #32 __LF \ + adcs x3, x3, x10 __LF \ + extr x10, x8, x7, #32 __LF \ + adcs x4, x4, x10 __LF \ + extr x10, x9, x8, #32 __LF \ + lsl x11, x13, #63 __LF \ + eor x5, x5, x11 __LF \ + adc x5, x5, x10 __LF \ + stp x2, x3, [P0] __LF \ + stp x4, x5, [P0+16] + +// Plain 4-digit adding without any normalization. +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + ldp x0, x1, [P1] __LF \ + ldp x4, x5, [P2] __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + ldp x2, x3, [P1+16] __LF \ + ldp x6, x7, [P2+16] __LF \ + adcs x2, x2, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. +// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. 
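To make the carry-folding step concrete, here is a minimal C sketch of the same reduction (illustration only, not part of the imported source; the helper name add_twice4_sketch and the use of unsigned __int128 are ours). Since 2^256 == 38 modulo the double modulus 2^256 - 38, a carry out of the top limb is folded back in as +38, and the precondition stated above is exactly what guarantees that this fold cannot itself carry out of the top limb a second time.

    #include <stdint.h>

    /* Hedged sketch of the add_twice4/double_twice4 reduction step:
       add two 4-limb values and reduce modulo 2*p_25519 = 2^256 - 38
       by folding a carry out of bit 256 back in as +38. */
    static void add_twice4_sketch(uint64_t z[4], const uint64_t x[4],
                                  const uint64_t y[4]) {
      uint64_t t[4];
      unsigned __int128 acc = 0;
      for (int i = 0; i < 4; i++) {        /* plain 256-bit addition */
        acc += (unsigned __int128)x[i] + y[i];
        t[i] = (uint64_t)acc;
        acc >>= 64;
      }
      uint64_t fold = acc ? 38 : 0;        /* 2^256 == 38 (mod 2^256 - 38) */
      acc = fold;
      for (int i = 0; i < 4; i++) {        /* final carry is zero whenever  */
        acc += t[i];                       /* the input sum is < 2^257 - 38 */
        z[i] = (uint64_t)acc;
        acc >>= 64;
      }
    }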
+ +#define add_twice4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +#define double_twice4(P0,P1) \ + ldp x3, x4, [P1] __LF \ + adds x3, x3, x3 __LF \ + adcs x4, x4, x4 __LF \ + ldp x5, x6, [P1+16] __LF \ + adcs x5, x5, x5 __LF \ + adcs x6, x6, x6 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +S2N_BN_SYMBOL(edwards25519_epdouble): + +// Save regs and make room for temporaries + + stp x19, x20, [sp, -16]! + sub sp, sp, #NSPACE + +// Move the input arguments to stable places + + mov p3, x0 + mov p1, x1 + +// Main sequence + + add_4(t0,x_1,y_1) + sqr_4(t1,z_1) + sqr_4(t2,x_1) + sqr_4(t3,y_1) + double_twice4(t1,t1) + sqr_4(t0,t0) + add_twice4(t4,t2,t3) + sub_twice4(t2,t2,t3) + add_twice4(t3,t1,t2) + sub_twice4(t1,t4,t0) + mul_p25519(y_3,t2,t4) + mul_p25519(z_3,t3,t2) + mul_p25519(w_3,t1,t4) + mul_p25519(x_3,t1,t3) + +// Restore stack and registers + + add sp, sp, #NSPACE + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_epdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_epdouble_alt.S new file mode 100644 index 00000000000..c6b9332c09c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_epdouble_alt.S @@ -0,0 +1,357 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended projective doubling for edwards25519 +// Input p1[12]; output p3[16] +// +// extern void edwards25519_epdouble +// (uint64_t p3[static 16],uint64_t p1[static 12]) +// +// If p1 is a point on edwards25519, returns its double p3 = 2 * p1. +// The output p3 is in extended projective coordinates, representing +// affine (x,y) by a quadruple (X,Y,Z,T) where x = X / Z, y = Y / Z +// and x * y = T / Z. The input p1 may also be in the same extended +// projective representation, but the final T field is not used so +// a more basic projective triple (X,Y,Z) suffices. 
+// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_epdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_epdouble_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define p3 x17 +#define p1 x19 + +// Pointers to input and output coordinates + +#define x_1 p1, #0 +#define y_1 p1, #NUMSIZE +#define z_1 p1, #(2*NUMSIZE) + +#define x_3 p3, #0 +#define y_3 p3, #NUMSIZE +#define z_3 p3, #(2*NUMSIZE) +#define w_3 p3, #(3*NUMSIZE) + +// Pointer-offset pairs for temporaries on stack + +#define t0 sp, #(0*NUMSIZE) +#define t1 sp, #(1*NUMSIZE) +#define t2 sp, #(2*NUMSIZE) +#define t3 sp, #(3*NUMSIZE) +#define t4 sp, #(4*NUMSIZE) + +// Total size to reserve on the stack + +#define NSPACE (5*NUMSIZE) + +// Macro wrapping up the basic field operation bignum_mul_p25519_alt, only +// trivially different from a pure function call to that subroutine. + +#define mul_p25519(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + orr x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + madd x11, x7, x8, x7 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs 
x14, x14, x3 __LF \ + adcs x15, x15, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + and x15, x15, #0x7fffffffffffffff __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x15, [P0+16] + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. + +#define sqr_4(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x7, x2, x4 __LF \ + umulh x6, x2, x4 __LF \ + adds x10, x10, x7 __LF \ + adcs x11, x11, x6 __LF \ + mul x7, x3, x4 __LF \ + umulh x6, x3, x4 __LF \ + adc x6, x6, xzr __LF \ + adds x11, x11, x7 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x6 __LF \ + mul x7, x3, x5 __LF \ + umulh x6, x3, x5 __LF \ + adc x6, x6, xzr __LF \ + adds x12, x12, x7 __LF \ + adcs x13, x13, x6 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x6, cs __LF \ + umulh x7, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x7 __LF \ + mul x7, x3, x3 __LF \ + adcs x10, x10, x7 __LF \ + umulh x7, x3, x3 __LF \ + adcs x11, x11, x7 __LF \ + mul x7, x4, x4 __LF \ + adcs x12, x12, x7 __LF \ + umulh x7, x4, x4 __LF \ + adcs x13, x13, x7 __LF \ + mul x7, x5, x5 __LF \ + adcs x14, x14, x7 __LF \ + umulh x7, x5, x5 __LF \ + adc x6, x6, x7 __LF \ + mov x3, #0x26 __LF \ + mul x7, x3, x12 __LF \ + umulh x4, x3, x12 __LF \ + adds x8, x8, x7 __LF \ + mul x7, x3, x13 __LF \ + umulh x13, x3, x13 __LF \ + adcs x9, x9, x7 __LF \ + mul x7, x3, x14 __LF \ + umulh x14, x3, x14 __LF \ + adcs x10, x10, x7 __LF \ + mul x7, x3, x6 __LF \ + umulh x6, x3, x6 __LF \ + adcs x11, x11, x7 __LF \ + cset x12, cs __LF \ + adds x11, x11, x14 __LF \ + adc x12, x12, x6 __LF \ + cmn x11, x11 __LF \ + bic x11, x11, #0x8000000000000000 __LF \ + adc x2, x12, x12 __LF \ + mov x3, #0x13 __LF \ + mul x7, x3, x2 __LF \ + adds x8, x8, x7 __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x13 __LF \ + adc x11, x11, xzr __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Plain 4-digit adding without any normalization. +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + ldp x0, x1, [P1] __LF \ + ldp x4, x5, [P2] __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + ldp x2, x3, [P1+16] __LF \ + ldp x6, x7, [P2+16] __LF \ + adcs x2, x2, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. +// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. 
The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. + +#define add_twice4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +#define double_twice4(P0,P1) \ + ldp x3, x4, [P1] __LF \ + adds x3, x3, x3 __LF \ + adcs x4, x4, x4 __LF \ + ldp x5, x6, [P1+16] __LF \ + adcs x5, x5, x5 __LF \ + adcs x6, x6, x6 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +S2N_BN_SYMBOL(edwards25519_epdouble_alt): + +// Save regs and make room for temporaries + + stp x19, x20, [sp, -16]! + sub sp, sp, #NSPACE + +// Move the input arguments to stable places + + mov p3, x0 + mov p1, x1 + +// Main sequence + + add_4(t0,x_1,y_1) + sqr_4(t1,z_1) + sqr_4(t2,x_1) + sqr_4(t3,y_1) + double_twice4(t1,t1) + sqr_4(t0,t0) + add_twice4(t4,t2,t3) + sub_twice4(t2,t2,t3) + add_twice4(t3,t1,t2) + sub_twice4(t1,t4,t0) + mul_p25519(y_3,t2,t4) + mul_p25519(z_3,t3,t2) + mul_p25519(w_3,t1,t4) + mul_p25519(x_3,t1,t3) + +// Restore stack and registers + + add sp, sp, #NSPACE + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_pdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_pdouble.S new file mode 100644 index 00000000000..c79ab204693 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_pdouble.S @@ -0,0 +1,489 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Projective doubling for edwards25519 +// Input p1[12]; output p3[12] +// +// extern void edwards25519_pdouble +// (uint64_t p3[static 12],uint64_t p1[static 12]) +// +// If p1 is a point on edwards25519, returns its double p3 = 2 * p1. +// Input and output are in pure projective coordinates, representing +// an affine (x,y) by a triple (X,Y,Z) where x = X / Z, y = Y / Z. 
+// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_pdouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_pdouble) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define p3 x17 +#define p1 x19 + +// Pointers to input and output coordinates + +#define x_1 p1, #0 +#define y_1 p1, #NUMSIZE +#define z_1 p1, #(2*NUMSIZE) + +#define x_3 p3, #0 +#define y_3 p3, #NUMSIZE +#define z_3 p3, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries on stack + +#define t0 sp, #(0*NUMSIZE) +#define t1 sp, #(1*NUMSIZE) +#define t2 sp, #(2*NUMSIZE) +#define t3 sp, #(3*NUMSIZE) +#define t4 sp, #(4*NUMSIZE) + +// Total size to reserve on the stack + +#define NSPACE (5*NUMSIZE) + +// Macro wrapping up the basic field operation bignum_mul_p25519, only +// trivially different from a pure function call to that subroutine. + +#define mul_p25519(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, 
x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umaddl x5, w5, w0, x5 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + mov x3, #0x13 __LF \ + tst x10, #0x8000000000000000 __LF \ + csel x3, x3, xzr, pl __LF \ + subs x7, x7, x3 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + and x10, x10, #0x7fffffffffffffff __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. 
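In case the shortcut is not obvious from the instruction stream: since p_25519 = 2^255 - 19, we have 2^255 == 19 and 2^256 == 38 (mod p_25519), so a 512-bit square splits as

    z = 2^256 * h + l == 38 * h + l (mod p_25519).

The macro below folds the high half back in with the factor 0x26 = 38 and the remaining top bits with 0x13 = 19; because the quotient estimate is not rounded up and the final conditional subtraction is omitted, the result is only guaranteed to be < 2 * p_25519 rather than fully reduced, which appears to be all that the intermediate computations in this file require.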
+ +#define sqr_4(P0,P1) \ + ldp x10, x11, [P1] __LF \ + ldp x12, x13, [P1+16] __LF \ + umull x2, w10, w10 __LF \ + lsr x14, x10, #32 __LF \ + umull x3, w14, w14 __LF \ + umull x14, w10, w14 __LF \ + adds x2, x2, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x3, x3, x14 __LF \ + umull x4, w11, w11 __LF \ + lsr x14, x11, #32 __LF \ + umull x5, w14, w14 __LF \ + umull x14, w11, w14 __LF \ + mul x15, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x4, x4, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x5, x5, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x5, x5, xzr __LF \ + adds x3, x3, x15 __LF \ + adcs x4, x4, x16 __LF \ + adc x5, x5, xzr __LF \ + umull x6, w12, w12 __LF \ + lsr x14, x12, #32 __LF \ + umull x7, w14, w14 __LF \ + umull x14, w12, w14 __LF \ + adds x6, x6, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x7, x7, x14 __LF \ + umull x8, w13, w13 __LF \ + lsr x14, x13, #32 __LF \ + umull x9, w14, w14 __LF \ + umull x14, w13, w14 __LF \ + mul x15, x12, x13 __LF \ + umulh x16, x12, x13 __LF \ + adds x8, x8, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x9, x9, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x9, x9, xzr __LF \ + adds x7, x7, x15 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, xzr __LF \ + subs x10, x10, x12 __LF \ + sbcs x11, x11, x13 __LF \ + csetm x16, cc __LF \ + eor x10, x10, x16 __LF \ + subs x10, x10, x16 __LF \ + eor x11, x11, x16 __LF \ + sbc x11, x11, x16 __LF \ + adds x6, x6, x4 __LF \ + adcs x7, x7, x5 __LF \ + adcs x8, x8, xzr __LF \ + adc x9, x9, xzr __LF \ + umull x12, w10, w10 __LF \ + lsr x5, x10, #32 __LF \ + umull x13, w5, w5 __LF \ + umull x5, w10, w5 __LF \ + adds x12, x12, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x13, x13, x5 __LF \ + umull x15, w11, w11 __LF \ + lsr x5, x11, #32 __LF \ + umull x14, w5, w5 __LF \ + umull x5, w11, w5 __LF \ + mul x4, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x15, x15, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x14, x14, x5 __LF \ + adds x4, x4, x4 __LF \ + adcs x16, x16, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x13, x13, x4 __LF \ + adcs x15, x15, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x4, x2, x6 __LF \ + adcs x5, x3, x7 __LF \ + adcs x6, x6, x8 __LF \ + adcs x7, x7, x9 __LF \ + csetm x16, cc __LF \ + subs x4, x4, x12 __LF \ + sbcs x5, x5, x13 __LF \ + sbcs x6, x6, x15 __LF \ + sbcs x7, x7, x14 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, x16 __LF \ + mov x10, #0x26 __LF \ + umull x12, w6, w10 __LF \ + add x12, x12, w2, uxtw __LF \ + lsr x2, x2, #32 __LF \ + lsr x6, x6, #32 __LF \ + umaddl x6, w6, w10, x2 __LF \ + mov x2, x12 __LF \ + umull x12, w7, w10 __LF \ + add x12, x12, w3, uxtw __LF \ + lsr x3, x3, #32 __LF \ + lsr x7, x7, #32 __LF \ + umaddl x7, w7, w10, x3 __LF \ + mov x3, x12 __LF \ + umull x12, w8, w10 __LF \ + add x12, x12, w4, uxtw __LF \ + lsr x4, x4, #32 __LF \ + lsr x8, x8, #32 __LF \ + umaddl x8, w8, w10, x4 __LF \ + mov x4, x12 __LF \ + umull x12, w9, w10 __LF \ + add x12, x12, w5, uxtw __LF \ + lsr x5, x5, #32 __LF \ + lsr x9, x9, #32 __LF \ + umaddl x9, w9, w10, x5 __LF \ + mov x5, x12 __LF \ + lsr x13, x9, #31 __LF \ + mov x11, #0x13 __LF \ + umull x11, w11, w13 __LF \ + add x2, x2, x11 __LF \ + adds x2, x2, x6, lsl #32 __LF \ + extr x10, x7, x6, #32 __LF \ + adcs x3, x3, x10 __LF \ + extr x10, x8, x7, #32 __LF \ + adcs x4, x4, x10 __LF \ + extr x10, x9, x8, #32 __LF \ + lsl x11, x13, #63 __LF \ + eor x5, x5, x11 __LF \ + adc x5, x5, x10 __LF \ + stp x2, x3, [P0] __LF \ + stp 
x4, x5, [P0+16] + +// Plain 4-digit adding without any normalization. +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + ldp x0, x1, [P1] __LF \ + ldp x4, x5, [P2] __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + ldp x2, x3, [P1+16] __LF \ + ldp x6, x7, [P2+16] __LF \ + adcs x2, x2, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. +// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. + +#define add_twice4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +#define double_twice4(P0,P1) \ + ldp x3, x4, [P1] __LF \ + adds x3, x3, x3 __LF \ + adcs x4, x4, x4 __LF \ + ldp x5, x6, [P1+16] __LF \ + adcs x5, x5, x5 __LF \ + adcs x6, x6, x6 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +S2N_BN_SYMBOL(edwards25519_pdouble): + +// Save regs and make room for temporaries + + stp x19, x20, [sp, -16]! + sub sp, sp, #NSPACE + +// Move the input arguments to stable places + + mov p3, x0 + mov p1, x1 + +// Main sequence + + add_4(t0,x_1,y_1) + sqr_4(t1,z_1) + sqr_4(t2,x_1) + sqr_4(t3,y_1) + double_twice4(t1,t1) + sqr_4(t0,t0) + add_twice4(t4,t2,t3) + sub_twice4(t2,t2,t3) + add_twice4(t3,t1,t2) + sub_twice4(t1,t4,t0) + mul_p25519(y_3,t2,t4) + mul_p25519(z_3,t3,t2) + mul_p25519(x_3,t1,t3) + +// Restore stack and registers + + add sp, sp, #NSPACE + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_pdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_pdouble_alt.S new file mode 100644 index 00000000000..8b9e75eb925 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_pdouble_alt.S @@ -0,0 +1,352 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Projective doubling for edwards25519 +// Input p1[12]; output p3[12] +// +// extern void edwards25519_pdouble +// (uint64_t p3[static 12],uint64_t p1[static 12]) +// +// If p1 is a point on edwards25519, returns its double p3 = 2 * p1. +// Input and output are in pure projective coordinates, representing +// an affine (x,y) by a triple (X,Y,Z) where x = X / Z, y = Y / Z. +// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_pdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_pdouble_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define p3 x17 +#define p1 x19 + +// Pointers to input and output coordinates + +#define x_1 p1, #0 +#define y_1 p1, #NUMSIZE +#define z_1 p1, #(2*NUMSIZE) + +#define x_3 p3, #0 +#define y_3 p3, #NUMSIZE +#define z_3 p3, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries on stack + +#define t0 sp, #(0*NUMSIZE) +#define t1 sp, #(1*NUMSIZE) +#define t2 sp, #(2*NUMSIZE) +#define t3 sp, #(3*NUMSIZE) +#define t4 sp, #(4*NUMSIZE) + +// Total size to reserve on the stack + +#define NSPACE (5*NUMSIZE) + +// Macro wrapping up the basic field operation bignum_mul_p25519_alt, only +// trivially different from a pure function call to that subroutine. + +#define mul_p25519(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh 
x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + orr x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + madd x11, x7, x8, x7 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x15, x15, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + and x15, x15, #0x7fffffffffffffff __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x15, [P0+16] + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. + +#define sqr_4(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x7, x2, x4 __LF \ + umulh x6, x2, x4 __LF \ + adds x10, x10, x7 __LF \ + adcs x11, x11, x6 __LF \ + mul x7, x3, x4 __LF \ + umulh x6, x3, x4 __LF \ + adc x6, x6, xzr __LF \ + adds x11, x11, x7 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x6 __LF \ + mul x7, x3, x5 __LF \ + umulh x6, x3, x5 __LF \ + adc x6, x6, xzr __LF \ + adds x12, x12, x7 __LF \ + adcs x13, x13, x6 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x6, cs __LF \ + umulh x7, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x7 __LF \ + mul x7, x3, x3 __LF \ + adcs x10, x10, x7 __LF \ + umulh x7, x3, x3 __LF \ + adcs x11, x11, x7 __LF \ + mul x7, x4, x4 __LF \ + adcs x12, x12, x7 __LF \ + umulh x7, x4, x4 __LF \ + adcs x13, x13, x7 __LF \ + mul x7, x5, x5 __LF \ + adcs x14, x14, x7 __LF \ + umulh x7, x5, x5 __LF \ + adc x6, x6, x7 __LF \ + mov x3, #0x26 __LF \ + mul x7, x3, x12 __LF \ + umulh x4, x3, x12 __LF \ + adds x8, x8, x7 __LF \ + mul x7, x3, x13 __LF \ + umulh x13, x3, x13 __LF \ + adcs x9, x9, x7 __LF \ + mul x7, x3, x14 __LF \ + umulh x14, x3, x14 __LF \ + adcs x10, x10, x7 __LF \ + mul x7, x3, x6 __LF \ + umulh x6, x3, x6 __LF \ + adcs x11, x11, x7 __LF \ + cset x12, cs __LF \ + adds x11, x11, x14 __LF \ + adc x12, x12, x6 __LF \ + cmn x11, x11 __LF \ + bic x11, x11, #0x8000000000000000 __LF \ + adc x2, x12, x12 __LF \ + mov x3, #0x13 __LF \ + mul x7, x3, x2 __LF \ + adds x8, x8, x7 __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x13 __LF \ + adc x11, x11, xzr __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Plain 4-digit adding without any normalization. +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. 
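These add/sub/double helpers feed the main sequence at the end of this file. For orientation, our reading of that sequence (not a comment from the imported source) is that, writing A = X1^2, B = Y1^2 and C = 2*Z1^2, the temporaries become

    t4 = A + B,   t2 = A - B,   t3 = C + A - B,
    t1 = (A + B) - (X1 + Y1)^2 == -2*X1*Y1 (mod p_25519),

so that, modulo p_25519,

    X3 = -2*X1*Y1 * (C + A - B),   Y3 = (A - B) * (A + B),   Z3 = (C + A - B) * (A - B),

which appears to match the standard projective doubling formulas for a twisted Edwards curve with a = -1, i.e. x3 = 2*x1*y1 / (y1^2 - x1^2) and y3 = (x1^2 + y1^2) / (2 + x1^2 - y1^2).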
+ +#define add_4(P0,P1,P2) \ + ldp x0, x1, [P1] __LF \ + ldp x4, x5, [P2] __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + ldp x2, x3, [P1+16] __LF \ + ldp x6, x7, [P2+16] __LF \ + adcs x2, x2, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. +// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. + +#define add_twice4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +#define double_twice4(P0,P1) \ + ldp x3, x4, [P1] __LF \ + adds x3, x3, x3 __LF \ + adcs x4, x4, x4 __LF \ + ldp x5, x6, [P1+16] __LF \ + adcs x5, x5, x5 __LF \ + adcs x6, x6, x6 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +S2N_BN_SYMBOL(edwards25519_pdouble_alt): + +// Save regs and make room for temporaries + + stp x19, x20, [sp, -16]! + sub sp, sp, #NSPACE + +// Move the input arguments to stable places + + mov p3, x0 + mov p1, x1 + +// Main sequence + + add_4(t0,x_1,y_1) + sqr_4(t1,z_1) + sqr_4(t2,x_1) + sqr_4(t3,y_1) + double_twice4(t1,t1) + sqr_4(t0,t0) + add_twice4(t4,t2,t3) + sub_twice4(t2,t2,t3) + add_twice4(t3,t1,t2) + sub_twice4(t1,t4,t0) + mul_p25519(y_3,t2,t4) + mul_p25519(z_3,t3,t2) + mul_p25519(x_3,t1,t3) + +// Restore stack and registers + + add sp, sp, #NSPACE + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_pepadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_pepadd.S new file mode 100644 index 00000000000..4fefdfc3693 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_pepadd.S @@ -0,0 +1,562 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended projective + precomputed mixed addition for edwards25519 +// Inputs p1[16], p2[12]; output p3[16] +// +// extern void edwards25519_pepadd +// (uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 12]) +// +// The output p3 and the first input p1 are points (x,y) on edwards25519 +// represented in extended projective quadruples (X,Y,Z,T) where +// x = X / Z, y = Y / Z and x * y = T / Z. The second input p2 is a triple +// encoding its point (x,y) as (y - x,y + x,2 * d * x * y) where d is the +// usual Edwards curve parameter for edwards25519. +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_pepadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_pepadd) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define p3 x17 +#define p1 x19 +#define p2 x20 + +// Pointers to input and output coordinates + +#define x_1 p1, #0 +#define y_1 p1, #NUMSIZE +#define z_1 p1, #(2*NUMSIZE) +#define w_1 p1, #(3*NUMSIZE) + +#define ymx_2 p2, #0 +#define xpy_2 p2, #NUMSIZE +#define kxy_2 p2, #(2*NUMSIZE) + +#define x_3 p3, #0 +#define y_3 p3, #NUMSIZE +#define z_3 p3, #(2*NUMSIZE) +#define w_3 p3, #(3*NUMSIZE) + +// Pointer-offset pairs for temporaries on stack + +#define t0 sp, #(0*NUMSIZE) +#define t1 sp, #(1*NUMSIZE) +#define t2 sp, #(2*NUMSIZE) +#define t3 sp, #(3*NUMSIZE) +#define t4 sp, #(4*NUMSIZE) +#define t5 sp, #(5*NUMSIZE) + +// Total size to reserve on the stack + +#define NSPACE (6*NUMSIZE) + +// Macro wrapping up the basic field operation bignum_mul_p25519, only +// trivially different from a pure function call to that subroutine. 
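As a plain-C reference for the contract we read here, namely a fully reduced product modulo p_25519 = 2^255 - 19, the following sketch multiplies schoolbook-style and then reduces by folding with 38 = 2^256 mod p_25519 and 19 = 2^255 mod p_25519, finishing with at most one subtraction of p_25519. It is an illustration only (not the s2n-bignum algorithm, not constant-time; the helper name mul_p25519_sketch is ours).

    #include <stdint.h>

    static void mul_p25519_sketch(uint64_t z[4], const uint64_t x[4],
                                  const uint64_t y[4]) {
      /* 512-bit schoolbook product w = x * y */
      uint64_t w[8] = {0};
      for (int i = 0; i < 4; i++) {
        unsigned __int128 c = 0;
        for (int j = 0; j < 4; j++) {
          c += (unsigned __int128)x[i] * y[j] + w[i + j];
          w[i + j] = (uint64_t)c;
          c >>= 64;
        }
        w[i + 4] = (uint64_t)c;
      }
      /* fold the high half: 2^256 == 38 (mod p_25519) */
      uint64_t t[4];
      unsigned __int128 c = 0;
      for (int i = 0; i < 4; i++) {
        c += (unsigned __int128)38 * w[i + 4] + w[i];
        t[i] = (uint64_t)c;
        c >>= 64;
      }
      /* fold bit 255 and above: 2^255 == 19 (mod p_25519) */
      uint64_t q = ((uint64_t)c << 1) | (t[3] >> 63);
      t[3] &= 0x7fffffffffffffffULL;
      c = (unsigned __int128)19 * q;
      for (int i = 0; i < 4; i++) {
        c += t[i];
        t[i] = (uint64_t)c;
        c >>= 64;
      }
      /* at most one subtraction of p_25519 is now needed */
      const uint64_t p[4] = {0xffffffffffffffedULL, 0xffffffffffffffffULL,
                             0xffffffffffffffffULL, 0x7fffffffffffffffULL};
      uint64_t d[4];
      unsigned __int128 b = 0;
      for (int i = 0; i < 4; i++) {
        unsigned __int128 s = (unsigned __int128)t[i] - p[i] - (uint64_t)b;
        d[i] = (uint64_t)s;
        b = (s >> 127) & 1;              /* 1 if the subtraction borrowed */
      }
      for (int i = 0; i < 4; i++) z[i] = b ? t[i] : d[i];
    }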
+ +#define mul_p25519(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, 
x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umaddl x5, w5, w0, x5 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + mov x3, #0x13 __LF \ + tst x10, #0x8000000000000000 __LF \ + csel x3, x3, xzr, pl __LF \ + subs x7, x7, x3 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + and x10, x10, #0x7fffffffffffffff __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. + +#define mul_4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor 
x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umull x5, w5, w0 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] + +// Plain 4-digit add and doubling without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + ldp x0, x1, [P1] __LF \ + ldp x4, x5, [P2] __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + ldp x2, x3, [P1+16] __LF \ + ldp x6, x7, [P2+16] __LF \ + adcs x2, x2, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +#define double_4(P0,P1) \ + ldp x0, x1, [P1] __LF \ + adds x0, x0, x0 __LF \ + adcs x1, x1, x1 __LF \ + ldp x2, x3, [P1+16] __LF \ + adcs x2, x2, x2 __LF \ + adc x3, x3, x3 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. 
It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x3, #19 __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + mov x4, #0x8000000000000000 __LF \ + sbc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Modular addition with inputs double modulus 2 * p_25519 = 2^256 - 38 +// and in general only guaranteeing a 4-digit result, not even < 2 * p_25519. + +#define add_twice4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +S2N_BN_SYMBOL(edwards25519_pepadd): + +// Save regs and make room for temporaries + + stp x19, x20, [sp, -16]! + sub sp, sp, #NSPACE + +// Move the input arguments to stable places + + mov p3, x0 + mov p1, x1 + mov p2, x2 + +// Main sequence + + double_4(t0,z_1); + + sub_4(t1,y_1,x_1); + add_4(t2,y_1,x_1); + + mul_4(t3,w_1,kxy_2); + + mul_4(t1,t1,ymx_2); + mul_4(t2,t2,xpy_2); + + sub_twice4(t4,t0,t3); + add_twice4(t0,t0,t3); + sub_twice4(t5,t2,t1); + add_twice4(t1,t2,t1); + + mul_p25519(z_3,t4,t0); + mul_p25519(x_3,t5,t4); + mul_p25519(y_3,t0,t1); + mul_p25519(w_3,t5,t1); + +// Restore stack and registers + + add sp, sp, #NSPACE + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_pepadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_pepadd_alt.S new file mode 100644 index 00000000000..eb9d55f1adb --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_pepadd_alt.S @@ -0,0 +1,404 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended projective + precomputed mixed addition for edwards25519 +// Inputs p1[16], p2[12]; output p3[16] +// +// extern void edwards25519_pepadd_alt +// (uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 12]) +// +// The output p3 and the first input p1 are points (x,y) on edwards25519 +// represented in extended projective quadruples (X,Y,Z,T) where +// x = X / Z, y = Y / Z and x * y = T / Z. 
The second input p2 is a triple +// encoding its point (x,y) as (y - x,y + x,2 * d * x * y) where d is the +// usual Edwards curve parameter for edwards25519. +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_pepadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_pepadd_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define p3 x17 +#define p1 x19 +#define p2 x20 + +// Pointers to input and output coordinates + +#define x_1 p1, #0 +#define y_1 p1, #NUMSIZE +#define z_1 p1, #(2*NUMSIZE) +#define w_1 p1, #(3*NUMSIZE) + +#define ymx_2 p2, #0 +#define xpy_2 p2, #NUMSIZE +#define kxy_2 p2, #(2*NUMSIZE) + +#define x_3 p3, #0 +#define y_3 p3, #NUMSIZE +#define z_3 p3, #(2*NUMSIZE) +#define w_3 p3, #(3*NUMSIZE) + +// Pointer-offset pairs for temporaries on stack + +#define t0 sp, #(0*NUMSIZE) +#define t1 sp, #(1*NUMSIZE) +#define t2 sp, #(2*NUMSIZE) +#define t3 sp, #(3*NUMSIZE) +#define t4 sp, #(4*NUMSIZE) +#define t5 sp, #(5*NUMSIZE) + +// Total size to reserve on the stack + +#define NSPACE (6*NUMSIZE) + +// Macro wrapping up the basic field operation bignum_mul_p25519_alt, only +// trivially different from a pure function call to that subroutine. + +#define mul_p25519(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, 
x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + orr x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + madd x11, x7, x8, x7 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x15, x15, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + and x15, x15, #0x7fffffffffffffff __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x15, [P0+16] + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. + +#define mul_4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + bic x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + mul x11, x7, x8 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adc x15, x15, xzr __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x15, [P0+16] + +// Plain 4-digit add and doubling without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. 
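As context for the add_4 and double_4 macros that follow: each is a plain 256-bit carry chain over four 64-bit limbs, where adds starts the chain, adcs propagates it, and the final adc silently drops any carry out of the top limb (which cannot occur for the stated input range). A minimal C sketch of the same operation, assuming little-endian limb order and a compiler with the unsigned __int128 extension; the function name is illustrative and not part of the AWS-LC API:

    #include <stdint.h>

    /* Illustrative 4-limb (256-bit) addition mirroring the adds/adcs/adc
     * chain in add_4; any carry out of the top limb is discarded, exactly
     * as in the macro. double_4 is the same operation with x == y. */
    static void add_4_sketch(uint64_t z[4], const uint64_t x[4],
                             const uint64_t y[4]) {
      unsigned __int128 acc = 0;              /* running limb sum plus carry */
      for (int i = 0; i < 4; i++) {
        acc += (unsigned __int128)x[i] + y[i];
        z[i] = (uint64_t)acc;                 /* low 64 bits of this limb */
        acc >>= 64;                           /* carry into the next limb */
      }
    }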
+ +#define add_4(P0,P1,P2) \ + ldp x0, x1, [P1] __LF \ + ldp x4, x5, [P2] __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + ldp x2, x3, [P1+16] __LF \ + ldp x6, x7, [P2+16] __LF \ + adcs x2, x2, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +#define double_4(P0,P1) \ + ldp x0, x1, [P1] __LF \ + adds x0, x0, x0 __LF \ + adcs x1, x1, x1 __LF \ + ldp x2, x3, [P1+16] __LF \ + adcs x2, x2, x2 __LF \ + adc x3, x3, x3 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x3, #19 __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + mov x4, #0x8000000000000000 __LF \ + sbc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Modular addition with inputs double modulus 2 * p_25519 = 2^256 - 38 +// and in general only guaranteeing a 4-digit result, not even < 2 * p_25519. + +#define add_twice4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +S2N_BN_SYMBOL(edwards25519_pepadd_alt): + +// Save regs and make room for temporaries + + stp x19, x20, [sp, -16]! 
+ sub sp, sp, #NSPACE + +// Move the input arguments to stable places + + mov p3, x0 + mov p1, x1 + mov p2, x2 + +// Main sequence + + double_4(t0,z_1); + + sub_4(t1,y_1,x_1); + add_4(t2,y_1,x_1); + + mul_4(t3,w_1,kxy_2); + + mul_4(t1,t1,ymx_2); + mul_4(t2,t2,xpy_2); + + sub_twice4(t4,t0,t3); + add_twice4(t0,t0,t3); + sub_twice4(t5,t2,t1); + add_twice4(t1,t2,t1); + + mul_p25519(z_3,t4,t0); + mul_p25519(x_3,t5,t4); + mul_p25519(y_3,t0,t1); + mul_p25519(w_3,t5,t1); + +// Restore stack and registers + + add sp, sp, #NSPACE + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/curve25519/edwards25519_scalarmulbase.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_scalarmulbase.S similarity index 92% rename from third_party/s2n-bignum/arm/curve25519/edwards25519_scalarmulbase.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_scalarmulbase.S index e00aa7e278a..51be0c8427f 100644 --- a/third_party/s2n-bignum/arm/curve25519/edwards25519_scalarmulbase.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_scalarmulbase.S @@ -77,391 +77,391 @@ // Load 64-bit immediate into a register #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 // Macro wrapping up the basic field operation bignum_mul_p25519, only // trivially different from a pure function call to that subroutine. #define mul_p25519(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - umull x7, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x8, w16, w0; \ - umull x16, w3, w16; \ - adds x7, x7, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x8, x8, x15; \ - adds x7, x7, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x8, x8, x16; \ - mul x9, x4, x6; \ - umulh x10, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x9, x9, x8; \ - adc x10, x10, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x8, x7, x9; \ - adcs x9, x9, x10; \ - adc x10, x10, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x8, x15, x8; \ - eor x3, x3, x16; \ - adcs x9, x3, x9; \ - adc x10, x10, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x5, x6, [P2+16]; \ - umull x11, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x12, w16, w0; \ - umull x16, w3, w16; \ - adds x11, x11, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x12, x12, x15; \ - adds x11, x11, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x12, x12, x16; \ - mul x13, x4, x6; \ - umulh x14, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x13, x13, x12; \ - adc x14, x14, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x12, x11, x13; \ - adcs x13, x13, x14; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x12, x15, x12; \ - eor x3, x3, x16; \ - adcs x13, x3, x13; \ - adc x14, x14, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x15, x16, [P1]; \ - subs x3, x3, x15; \ - sbcs x4, x4, x16; \ - csetm x16, cc; \ - ldp x15, x0, [P2]; \ - subs x5, x15, x5; \ - sbcs x6, x0, x6; \ - csetm x0, cc; \ - eor x3, x3, x16; \ - subs x3, x3, x16; \ - eor x4, x4, x16; \ - sbc x4, x4, x16; \ - eor x5, x5, x0; \ - 
subs x5, x5, x0; \ - eor x6, x6, x0; \ - sbc x6, x6, x0; \ - eor x16, x0, x16; \ - adds x11, x11, x9; \ - adcs x12, x12, x10; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - mul x2, x3, x5; \ - umulh x0, x3, x5; \ - mul x15, x4, x6; \ - umulh x1, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x9, cc; \ - adds x15, x15, x0; \ - adc x1, x1, xzr; \ - subs x6, x5, x6; \ - cneg x6, x6, cc; \ - cinv x9, x9, cc; \ - mul x5, x4, x6; \ - umulh x6, x4, x6; \ - adds x0, x2, x15; \ - adcs x15, x15, x1; \ - adc x1, x1, xzr; \ - cmn x9, #0x1; \ - eor x5, x5, x9; \ - adcs x0, x5, x0; \ - eor x6, x6, x9; \ - adcs x15, x6, x15; \ - adc x1, x1, x9; \ - adds x9, x11, x7; \ - adcs x10, x12, x8; \ - adcs x11, x13, x11; \ - adcs x12, x14, x12; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x2, x2, x16; \ - adcs x9, x2, x9; \ - eor x0, x0, x16; \ - adcs x10, x0, x10; \ - eor x15, x15, x16; \ - adcs x11, x15, x11; \ - eor x1, x1, x16; \ - adcs x12, x1, x12; \ - adcs x13, x13, x16; \ - adc x14, x14, x16; \ - mov x3, #0x26; \ - umull x4, w11, w3; \ - add x4, x4, w7, uxtw; \ - lsr x7, x7, #32; \ - lsr x11, x11, #32; \ - umaddl x11, w11, w3, x7; \ - mov x7, x4; \ - umull x4, w12, w3; \ - add x4, x4, w8, uxtw; \ - lsr x8, x8, #32; \ - lsr x12, x12, #32; \ - umaddl x12, w12, w3, x8; \ - mov x8, x4; \ - umull x4, w13, w3; \ - add x4, x4, w9, uxtw; \ - lsr x9, x9, #32; \ - lsr x13, x13, #32; \ - umaddl x13, w13, w3, x9; \ - mov x9, x4; \ - umull x4, w14, w3; \ - add x4, x4, w10, uxtw; \ - lsr x10, x10, #32; \ - lsr x14, x14, #32; \ - umaddl x14, w14, w3, x10; \ - mov x10, x4; \ - lsr x0, x14, #31; \ - mov x5, #0x13; \ - umaddl x5, w5, w0, x5; \ - add x7, x7, x5; \ - adds x7, x7, x11, lsl #32; \ - extr x3, x12, x11, #32; \ - adcs x8, x8, x3; \ - extr x3, x13, x12, #32; \ - adcs x9, x9, x3; \ - extr x3, x14, x13, #32; \ - lsl x5, x0, #63; \ - eor x10, x10, x5; \ - adc x10, x10, x3; \ - mov x3, #0x13; \ - tst x10, #0x8000000000000000; \ - csel x3, x3, xzr, pl; \ - subs x7, x7, x3; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbc x10, x10, xzr; \ - and x10, x10, #0x7fffffffffffffff; \ - stp x7, x8, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + 
cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umaddl x5, w5, w0, x5 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + mov x3, #0x13 __LF \ + tst x10, #0x8000000000000000 __LF \ + csel x3, x3, xzr, pl __LF \ + subs x7, x7, x3 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + and x10, x10, #0x7fffffffffffffff __LF \ + stp x7, x8, [P0] __LF \ stp x9, x10, [P0+16] // A version of multiplication that only guarantees output < 2 * p_25519. // This basically skips the +1 and final correction in quotient estimation. 
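The reduction strategy here and in mul_p25519 rests on the identity 2^256 - 38 = 2 * p_25519, so 2^256 is congruent to 38 both modulo p_25519 and modulo 2 * p_25519; that is why the constants 0x26 (38) and 0x13 (19) appear throughout these macros. A hedged C sketch of the first folding step over an 8-limb double-width product follows; the helper name is illustrative, and the final fold of the small leftover carry, which the assembly interleaves into its carry chain, is omitted:

    #include <stdint.h>

    /* Fold the high 256 bits of an 8-limb product back into the low 256
     * bits using 2^256 == 38 (mod p_25519). Returns the small leftover
     * carry (< 39), which still needs one more folding step. */
    static uint64_t fold_high_times_38(uint64_t z[4], const uint64_t prod[8]) {
      unsigned __int128 acc = 0;
      for (int i = 0; i < 4; i++) {
        acc += (unsigned __int128)prod[i] +
               (unsigned __int128)prod[i + 4] * 38u;
        z[i] = (uint64_t)acc;
        acc >>= 64;
      }
      return (uint64_t)acc;
    }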
#define mul_4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - umull x7, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x8, w16, w0; \ - umull x16, w3, w16; \ - adds x7, x7, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x8, x8, x15; \ - adds x7, x7, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x8, x8, x16; \ - mul x9, x4, x6; \ - umulh x10, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x9, x9, x8; \ - adc x10, x10, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x8, x7, x9; \ - adcs x9, x9, x10; \ - adc x10, x10, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x8, x15, x8; \ - eor x3, x3, x16; \ - adcs x9, x3, x9; \ - adc x10, x10, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x5, x6, [P2+16]; \ - umull x11, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x12, w16, w0; \ - umull x16, w3, w16; \ - adds x11, x11, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x12, x12, x15; \ - adds x11, x11, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x12, x12, x16; \ - mul x13, x4, x6; \ - umulh x14, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x13, x13, x12; \ - adc x14, x14, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x12, x11, x13; \ - adcs x13, x13, x14; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x12, x15, x12; \ - eor x3, x3, x16; \ - adcs x13, x3, x13; \ - adc x14, x14, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x15, x16, [P1]; \ - subs x3, x3, x15; \ - sbcs x4, x4, x16; \ - csetm x16, cc; \ - ldp x15, x0, [P2]; \ - subs x5, x15, x5; \ - sbcs x6, x0, x6; \ - csetm x0, cc; \ - eor x3, x3, x16; \ - subs x3, x3, x16; \ - eor x4, x4, x16; \ - sbc x4, x4, x16; \ - eor x5, x5, x0; \ - subs x5, x5, x0; \ - eor x6, x6, x0; \ - sbc x6, x6, x0; \ - eor x16, x0, x16; \ - adds x11, x11, x9; \ - adcs x12, x12, x10; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - mul x2, x3, x5; \ - umulh x0, x3, x5; \ - mul x15, x4, x6; \ - umulh x1, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x9, cc; \ - adds x15, x15, x0; \ - adc x1, x1, xzr; \ - subs x6, x5, x6; \ - cneg x6, x6, cc; \ - cinv x9, x9, cc; \ - mul x5, x4, x6; \ - umulh x6, x4, x6; \ - adds x0, x2, x15; \ - adcs x15, x15, x1; \ - adc x1, x1, xzr; \ - cmn x9, #0x1; \ - eor x5, x5, x9; \ - adcs x0, x5, x0; \ - eor x6, x6, x9; \ - adcs x15, x6, x15; \ - adc x1, x1, x9; \ - adds x9, x11, x7; \ - adcs x10, x12, x8; \ - adcs x11, x13, x11; \ - adcs x12, x14, x12; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x2, x2, x16; \ - adcs x9, x2, x9; \ - eor x0, x0, x16; \ - adcs x10, x0, x10; \ - eor x15, x15, x16; \ - adcs x11, x15, x11; \ - eor x1, x1, x16; \ - adcs x12, x1, x12; \ - adcs x13, x13, x16; \ - adc x14, x14, x16; \ - mov x3, #0x26; \ - umull x4, w11, w3; \ - add x4, x4, w7, uxtw; \ - lsr x7, x7, #32; \ - lsr x11, x11, #32; \ - umaddl x11, w11, w3, x7; \ - mov x7, x4; \ - umull x4, w12, w3; \ - add x4, x4, w8, uxtw; \ - lsr x8, x8, #32; \ - lsr x12, x12, #32; \ - umaddl x12, w12, w3, x8; \ - mov x8, x4; \ - umull x4, w13, w3; \ - add x4, x4, w9, uxtw; \ - lsr x9, x9, #32; \ - lsr x13, x13, #32; \ - umaddl x13, w13, w3, x9; \ - mov x9, x4; \ - umull x4, w14, w3; \ - add x4, x4, w10, uxtw; \ - lsr x10, x10, #32; \ - lsr x14, x14, #32; \ - umaddl x14, w14, w3, x10; \ - mov x10, x4; \ - lsr x0, x14, #31; 
\ - mov x5, #0x13; \ - umull x5, w5, w0; \ - add x7, x7, x5; \ - adds x7, x7, x11, lsl #32; \ - extr x3, x12, x11, #32; \ - adcs x8, x8, x3; \ - extr x3, x13, x12, #32; \ - adcs x9, x9, x3; \ - extr x3, x14, x13, #32; \ - lsl x5, x0, #63; \ - eor x10, x10, x5; \ - adc x10, x10, x3; \ - stp x7, x8, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + 
eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umull x5, w5, w0 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + stp x7, x8, [P0] __LF \ stp x9, x10, [P0+16] // Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 #define sub_twice4(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - mov x4, #38; \ - csel x3, x4, xzr, lo; \ - subs x5, x5, x3; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbc x8, x8, xzr; \ - stp x5, x6, [P0]; \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ stp x7, x8, [P0+16] // Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. @@ -471,37 +471,37 @@ // at least one of them is reduced double modulo. 
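Both sub_twice4 above and add_twice4 below work modulo the double modulus 2 * p_25519 = 2^256 - 38: an addition that carries out of 2^256 is compensated by adding 38 back in, and a subtraction that borrows is compensated by subtracting 38, since 2^256 and 38 are congruent modulo 2 * p_25519. A minimal C sketch of the addition case, using an illustrative (non-AWS-LC) name; like the assembly, it only promises a 4-limb result, not one below 2 * p_25519:

    #include <stdint.h>

    /* Addition modulo 2*p_25519 = 2^256 - 38: add the limbs, then fold any
     * carry out of the top limb back in as +38 (the mov #38 / csel ... cs
     * pair in the macro). Written with loops and an early exit for clarity;
     * the assembly is straight-line and branch-free. */
    static void add_twice4_sketch(uint64_t z[4], const uint64_t x[4],
                                  const uint64_t y[4]) {
      unsigned __int128 acc = 0;
      for (int i = 0; i < 4; i++) {
        acc += (unsigned __int128)x[i] + y[i];
        z[i] = (uint64_t)acc;
        acc >>= 64;
      }
      unsigned __int128 fold = acc ? 38 : 0;
      for (int i = 0; i < 4 && fold; i++) {
        fold += z[i];
        z[i] = (uint64_t)fold;
        fold >>= 64;
      }
    }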
#define add_twice4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P2+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] #define double_twice4(P0,P1) \ - ldp x3, x4, [P1]; \ - adds x3, x3, x3; \ - adcs x4, x4, x4; \ - ldp x5, x6, [P1+16]; \ - adcs x5, x5, x5; \ - adcs x6, x6, x6; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + adds x3, x3, x3 __LF \ + adcs x4, x4, x4 __LF \ + ldp x5, x6, [P1+16] __LF \ + adcs x5, x5, x5 __LF \ + adcs x6, x6, x6 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] S2N_BN_SYMBOL(edwards25519_scalarmulbase): diff --git a/third_party/s2n-bignum/arm/curve25519/edwards25519_scalarmulbase_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_scalarmulbase_alt.S similarity index 95% rename from third_party/s2n-bignum/arm/curve25519/edwards25519_scalarmulbase_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_scalarmulbase_alt.S index 2ffc7799edb..726ae766dfc 100644 --- a/third_party/s2n-bignum/arm/curve25519/edwards25519_scalarmulbase_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_scalarmulbase_alt.S @@ -77,233 +77,233 @@ // Load 64-bit immediate into a register #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 // Macro wrapping up the basic field operation bignum_mul_p25519_alt, only // trivially different from a pure function call to that subroutine. 
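Unlike mul_4, the mul_p25519 macros return a fully reduced result strictly below p_25519, so they end with one extra conditional correction. The sketch below shows the effect of that final step in C, assuming the value has already been brought below 2 * p_25519; the function name is hypothetical, and the assembly reaches the same result by folding a quotient estimate (the madd with 0x13) into its carry chain rather than running a separate subtraction pass:

    #include <stdint.h>

    /* Bring a value known to be < 2*p_25519 into canonical range by
     * conditionally subtracting p_25519 = 2^255 - 19. Written with a
     * branch for clarity; the assembly is branch-free (csel plus masking). */
    static void reduce_once_sketch(uint64_t z[4]) {
      static const uint64_t p[4] = {0xffffffffffffffedULL, 0xffffffffffffffffULL,
                                    0xffffffffffffffffULL, 0x7fffffffffffffffULL};
      uint64_t t[4], borrow = 0;
      for (int i = 0; i < 4; i++) {
        uint64_t d = z[i] - p[i];
        uint64_t b1 = z[i] < p[i];
        t[i] = d - borrow;
        uint64_t b2 = d < borrow;
        borrow = b1 | b2;
      }
      if (!borrow) {            /* no borrow means z >= p, so keep z - p */
        for (int i = 0; i < 4; i++) z[i] = t[i];
      }
    }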
#define mul_p25519(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - mul x12, x3, x7; \ - umulh x13, x3, x7; \ - mul x11, x3, x8; \ - umulh x14, x3, x8; \ - adds x13, x13, x11; \ - ldp x9, x10, [P2+16]; \ - mul x11, x3, x9; \ - umulh x15, x3, x9; \ - adcs x14, x14, x11; \ - mul x11, x3, x10; \ - umulh x16, x3, x10; \ - adcs x15, x15, x11; \ - adc x16, x16, xzr; \ - ldp x5, x6, [P1+16]; \ - mul x11, x4, x7; \ - adds x13, x13, x11; \ - mul x11, x4, x8; \ - adcs x14, x14, x11; \ - mul x11, x4, x9; \ - adcs x15, x15, x11; \ - mul x11, x4, x10; \ - adcs x16, x16, x11; \ - umulh x3, x4, x10; \ - adc x3, x3, xzr; \ - umulh x11, x4, x7; \ - adds x14, x14, x11; \ - umulh x11, x4, x8; \ - adcs x15, x15, x11; \ - umulh x11, x4, x9; \ - adcs x16, x16, x11; \ - adc x3, x3, xzr; \ - mul x11, x5, x7; \ - adds x14, x14, x11; \ - mul x11, x5, x8; \ - adcs x15, x15, x11; \ - mul x11, x5, x9; \ - adcs x16, x16, x11; \ - mul x11, x5, x10; \ - adcs x3, x3, x11; \ - umulh x4, x5, x10; \ - adc x4, x4, xzr; \ - umulh x11, x5, x7; \ - adds x15, x15, x11; \ - umulh x11, x5, x8; \ - adcs x16, x16, x11; \ - umulh x11, x5, x9; \ - adcs x3, x3, x11; \ - adc x4, x4, xzr; \ - mul x11, x6, x7; \ - adds x15, x15, x11; \ - mul x11, x6, x8; \ - adcs x16, x16, x11; \ - mul x11, x6, x9; \ - adcs x3, x3, x11; \ - mul x11, x6, x10; \ - adcs x4, x4, x11; \ - umulh x5, x6, x10; \ - adc x5, x5, xzr; \ - umulh x11, x6, x7; \ - adds x16, x16, x11; \ - umulh x11, x6, x8; \ - adcs x3, x3, x11; \ - umulh x11, x6, x9; \ - adcs x4, x4, x11; \ - adc x5, x5, xzr; \ - mov x7, #0x26; \ - mul x11, x7, x16; \ - umulh x9, x7, x16; \ - adds x12, x12, x11; \ - mul x11, x7, x3; \ - umulh x3, x7, x3; \ - adcs x13, x13, x11; \ - mul x11, x7, x4; \ - umulh x4, x7, x4; \ - adcs x14, x14, x11; \ - mul x11, x7, x5; \ - umulh x5, x7, x5; \ - adcs x15, x15, x11; \ - cset x16, cs; \ - adds x15, x15, x4; \ - adc x16, x16, x5; \ - cmn x15, x15; \ - orr x15, x15, #0x8000000000000000; \ - adc x8, x16, x16; \ - mov x7, #0x13; \ - madd x11, x7, x8, x7; \ - adds x12, x12, x11; \ - adcs x13, x13, x9; \ - adcs x14, x14, x3; \ - adcs x15, x15, xzr; \ - csel x7, x7, xzr, cc; \ - subs x12, x12, x7; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbc x15, x15, xzr; \ - and x15, x15, #0x7fffffffffffffff; \ - stp x12, x13, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 
__LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + orr x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + madd x11, x7, x8, x7 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x15, x15, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + and x15, x15, #0x7fffffffffffffff __LF \ + stp x12, x13, [P0] __LF \ stp x14, x15, [P0+16] // A version of multiplication that only guarantees output < 2 * p_25519. // This basically skips the +1 and final correction in quotient estimation. #define mul_4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - mul x12, x3, x7; \ - umulh x13, x3, x7; \ - mul x11, x3, x8; \ - umulh x14, x3, x8; \ - adds x13, x13, x11; \ - ldp x9, x10, [P2+16]; \ - mul x11, x3, x9; \ - umulh x15, x3, x9; \ - adcs x14, x14, x11; \ - mul x11, x3, x10; \ - umulh x16, x3, x10; \ - adcs x15, x15, x11; \ - adc x16, x16, xzr; \ - ldp x5, x6, [P1+16]; \ - mul x11, x4, x7; \ - adds x13, x13, x11; \ - mul x11, x4, x8; \ - adcs x14, x14, x11; \ - mul x11, x4, x9; \ - adcs x15, x15, x11; \ - mul x11, x4, x10; \ - adcs x16, x16, x11; \ - umulh x3, x4, x10; \ - adc x3, x3, xzr; \ - umulh x11, x4, x7; \ - adds x14, x14, x11; \ - umulh x11, x4, x8; \ - adcs x15, x15, x11; \ - umulh x11, x4, x9; \ - adcs x16, x16, x11; \ - adc x3, x3, xzr; \ - mul x11, x5, x7; \ - adds x14, x14, x11; \ - mul x11, x5, x8; \ - adcs x15, x15, x11; \ - mul x11, x5, x9; \ - adcs x16, x16, x11; \ - mul x11, x5, x10; \ - adcs x3, x3, x11; \ - umulh x4, x5, x10; \ - adc x4, x4, xzr; \ - umulh x11, x5, x7; \ - adds x15, x15, x11; \ - umulh x11, x5, x8; \ - adcs x16, x16, x11; \ - umulh x11, x5, x9; \ - adcs x3, x3, x11; \ - adc x4, x4, xzr; \ - mul x11, x6, x7; \ - adds x15, x15, x11; \ - mul x11, x6, x8; \ - adcs x16, x16, x11; \ - mul x11, x6, x9; \ - adcs x3, x3, x11; \ - mul x11, x6, x10; \ - adcs x4, x4, x11; \ - umulh x5, x6, x10; \ - adc x5, x5, xzr; \ - umulh x11, x6, x7; \ - adds x16, x16, x11; \ - umulh x11, x6, x8; \ - adcs x3, x3, x11; \ - umulh x11, x6, x9; \ - adcs x4, x4, x11; \ - adc x5, x5, xzr; \ - mov x7, #0x26; \ - mul x11, x7, x16; \ - umulh x9, x7, x16; \ - adds x12, x12, x11; \ - mul x11, x7, x3; \ - umulh x3, x7, x3; \ - adcs x13, x13, x11; \ - mul x11, x7, x4; \ - umulh x4, x7, x4; \ - adcs x14, x14, x11; \ - mul x11, x7, x5; \ - umulh x5, x7, x5; \ - adcs x15, x15, x11; \ - cset x16, cs; \ - adds x15, x15, x4; \ - adc x16, x16, x5; \ - cmn x15, x15; \ - bic x15, x15, #0x8000000000000000; \ - adc x8, x16, 
x16; \ - mov x7, #0x13; \ - mul x11, x7, x8; \ - adds x12, x12, x11; \ - adcs x13, x13, x9; \ - adcs x14, x14, x3; \ - adc x15, x15, xzr; \ - stp x12, x13, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + bic x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + mul x11, x7, x8 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adc x15, x15, xzr __LF \ + stp x12, x13, [P0] __LF \ stp x14, x15, [P0+16] // Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 #define sub_twice4(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - mov x4, #38; \ - csel x3, x4, xzr, lo; \ - subs x5, x5, x3; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbc x8, x8, xzr; \ - stp x5, x6, [P0]; \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ stp x7, x8, [P0+16] // Modular addition and doubling with double modulus 2 
* p_25519 = 2^256 - 38. @@ -313,37 +313,37 @@ // at least one of them is reduced double modulo. #define add_twice4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P2+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] #define double_twice4(P0,P1) \ - ldp x3, x4, [P1]; \ - adds x3, x3, x3; \ - adcs x4, x4, x4; \ - ldp x5, x6, [P1+16]; \ - adcs x5, x5, x5; \ - adcs x6, x6, x6; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + adds x3, x3, x3 __LF \ + adcs x4, x4, x4 __LF \ + ldp x5, x6, [P1+16] __LF \ + adcs x5, x5, x5 __LF \ + adcs x6, x6, x6 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] S2N_BN_SYMBOL(edwards25519_scalarmulbase_alt): diff --git a/third_party/s2n-bignum/arm/curve25519/edwards25519_scalarmuldouble.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_scalarmuldouble.S similarity index 73% rename from third_party/s2n-bignum/arm/curve25519/edwards25519_scalarmuldouble.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_scalarmuldouble.S index d8c6e21c6e3..3d51f22d3d2 100644 --- a/third_party/s2n-bignum/arm/curve25519/edwards25519_scalarmuldouble.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_scalarmuldouble.S @@ -99,371 +99,371 @@ // Load 64-bit immediate into a register #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 // Macro wrapping up the basic field operation bignum_mul_p25519, only // trivially different from a pure function call to that subroutine. 
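The non-alt macros in this file differ from the _alt variants earlier in this diff partly in how they form 64x64 -> 128-bit partial products: several are built from 32x32 -> 64-bit umull multiplies and shifts rather than a mul/umulh pair, a trade-off that suits different microarchitectures (presumably why both variants are kept). A C sketch of that decomposition, with an illustrative helper name:

    #include <stdint.h>

    /* Illustrative 64x64 -> 128-bit multiply built from 32x32 -> 64-bit
     * products, the same shape as the umull-based sequences below, which
     * avoid umulh. */
    static void mul_64x64_128_sketch(uint64_t a, uint64_t b,
                                     uint64_t *hi, uint64_t *lo) {
      uint64_t a0 = (uint32_t)a, a1 = a >> 32;
      uint64_t b0 = (uint32_t)b, b1 = b >> 32;
      uint64_t p00 = a0 * b0, p01 = a0 * b1;
      uint64_t p10 = a1 * b0, p11 = a1 * b1;
      uint64_t t = p10 + (p00 >> 32);         /* cannot overflow 64 bits */
      uint64_t u = p01 + (t & 0xffffffffu);   /* cannot overflow 64 bits */
      *lo = (u << 32) | (p00 & 0xffffffffu);
      *hi = p11 + (t >> 32) + (u >> 32);
    }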
#define mul_p25519(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - umull x7, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x8, w16, w0; \ - umull x16, w3, w16; \ - adds x7, x7, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x8, x8, x15; \ - adds x7, x7, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x8, x8, x16; \ - mul x9, x4, x6; \ - umulh x10, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x9, x9, x8; \ - adc x10, x10, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x8, x7, x9; \ - adcs x9, x9, x10; \ - adc x10, x10, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x8, x15, x8; \ - eor x3, x3, x16; \ - adcs x9, x3, x9; \ - adc x10, x10, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x5, x6, [P2+16]; \ - umull x11, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x12, w16, w0; \ - umull x16, w3, w16; \ - adds x11, x11, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x12, x12, x15; \ - adds x11, x11, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x12, x12, x16; \ - mul x13, x4, x6; \ - umulh x14, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x13, x13, x12; \ - adc x14, x14, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x12, x11, x13; \ - adcs x13, x13, x14; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x12, x15, x12; \ - eor x3, x3, x16; \ - adcs x13, x3, x13; \ - adc x14, x14, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x15, x16, [P1]; \ - subs x3, x3, x15; \ - sbcs x4, x4, x16; \ - csetm x16, cc; \ - ldp x15, x0, [P2]; \ - subs x5, x15, x5; \ - sbcs x6, x0, x6; \ - csetm x0, cc; \ - eor x3, x3, x16; \ - subs x3, x3, x16; \ - eor x4, x4, x16; \ - sbc x4, x4, x16; \ - eor x5, x5, x0; \ - subs x5, x5, x0; \ - eor x6, x6, x0; \ - sbc x6, x6, x0; \ - eor x16, x0, x16; \ - adds x11, x11, x9; \ - adcs x12, x12, x10; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - mul x2, x3, x5; \ - umulh x0, x3, x5; \ - mul x15, x4, x6; \ - umulh x1, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x9, cc; \ - adds x15, x15, x0; \ - adc x1, x1, xzr; \ - subs x6, x5, x6; \ - cneg x6, x6, cc; \ - cinv x9, x9, cc; \ - mul x5, x4, x6; \ - umulh x6, x4, x6; \ - adds x0, x2, x15; \ - adcs x15, x15, x1; \ - adc x1, x1, xzr; \ - cmn x9, #0x1; \ - eor x5, x5, x9; \ - adcs x0, x5, x0; \ - eor x6, x6, x9; \ - adcs x15, x6, x15; \ - adc x1, x1, x9; \ - adds x9, x11, x7; \ - adcs x10, x12, x8; \ - adcs x11, x13, x11; \ - adcs x12, x14, x12; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x2, x2, x16; \ - adcs x9, x2, x9; \ - eor x0, x0, x16; \ - adcs x10, x0, x10; \ - eor x15, x15, x16; \ - adcs x11, x15, x11; \ - eor x1, x1, x16; \ - adcs x12, x1, x12; \ - adcs x13, x13, x16; \ - adc x14, x14, x16; \ - mov x3, #0x26; \ - umull x4, w11, w3; \ - add x4, x4, w7, uxtw; \ - lsr x7, x7, #32; \ - lsr x11, x11, #32; \ - umaddl x11, w11, w3, x7; \ - mov x7, x4; \ - umull x4, w12, w3; \ - add x4, x4, w8, uxtw; \ - lsr x8, x8, #32; \ - lsr x12, x12, #32; \ - umaddl x12, w12, w3, x8; \ - mov x8, x4; \ - umull x4, w13, w3; \ - add x4, x4, w9, uxtw; \ - lsr x9, x9, #32; \ - lsr x13, x13, #32; \ - umaddl x13, w13, w3, x9; \ - mov x9, x4; \ - umull x4, w14, w3; \ - add x4, x4, w10, uxtw; \ - lsr x10, x10, #32; \ - lsr x14, x14, #32; \ - umaddl x14, w14, w3, x10; \ - mov x10, x4; \ - lsr x0, x14, 
#31; \ - mov x5, #0x13; \ - umaddl x5, w5, w0, x5; \ - add x7, x7, x5; \ - adds x7, x7, x11, lsl #32; \ - extr x3, x12, x11, #32; \ - adcs x8, x8, x3; \ - extr x3, x13, x12, #32; \ - adcs x9, x9, x3; \ - extr x3, x14, x13, #32; \ - lsl x5, x0, #63; \ - eor x10, x10, x5; \ - adc x10, x10, x3; \ - mov x3, #0x13; \ - tst x10, #0x8000000000000000; \ - csel x3, x3, xzr, pl; \ - subs x7, x7, x3; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbc x10, x10, xzr; \ - and x10, x10, #0x7fffffffffffffff; \ - stp x7, x8, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 
__LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umaddl x5, w5, w0, x5 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + mov x3, #0x13 __LF \ + tst x10, #0x8000000000000000 __LF \ + csel x3, x3, xzr, pl __LF \ + subs x7, x7, x3 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + and x10, x10, #0x7fffffffffffffff __LF \ + stp x7, x8, [P0] __LF \ stp x9, x10, [P0+16] // A version of multiplication that only guarantees output < 2 * p_25519. // This basically skips the +1 and final correction in quotient estimation. 
#define mul_4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - umull x7, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x8, w16, w0; \ - umull x16, w3, w16; \ - adds x7, x7, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x8, x8, x15; \ - adds x7, x7, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x8, x8, x16; \ - mul x9, x4, x6; \ - umulh x10, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x9, x9, x8; \ - adc x10, x10, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x8, x7, x9; \ - adcs x9, x9, x10; \ - adc x10, x10, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x8, x15, x8; \ - eor x3, x3, x16; \ - adcs x9, x3, x9; \ - adc x10, x10, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x5, x6, [P2+16]; \ - umull x11, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x12, w16, w0; \ - umull x16, w3, w16; \ - adds x11, x11, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x12, x12, x15; \ - adds x11, x11, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x12, x12, x16; \ - mul x13, x4, x6; \ - umulh x14, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x13, x13, x12; \ - adc x14, x14, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x12, x11, x13; \ - adcs x13, x13, x14; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x12, x15, x12; \ - eor x3, x3, x16; \ - adcs x13, x3, x13; \ - adc x14, x14, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x15, x16, [P1]; \ - subs x3, x3, x15; \ - sbcs x4, x4, x16; \ - csetm x16, cc; \ - ldp x15, x0, [P2]; \ - subs x5, x15, x5; \ - sbcs x6, x0, x6; \ - csetm x0, cc; \ - eor x3, x3, x16; \ - subs x3, x3, x16; \ - eor x4, x4, x16; \ - sbc x4, x4, x16; \ - eor x5, x5, x0; \ - subs x5, x5, x0; \ - eor x6, x6, x0; \ - sbc x6, x6, x0; \ - eor x16, x0, x16; \ - adds x11, x11, x9; \ - adcs x12, x12, x10; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - mul x2, x3, x5; \ - umulh x0, x3, x5; \ - mul x15, x4, x6; \ - umulh x1, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x9, cc; \ - adds x15, x15, x0; \ - adc x1, x1, xzr; \ - subs x6, x5, x6; \ - cneg x6, x6, cc; \ - cinv x9, x9, cc; \ - mul x5, x4, x6; \ - umulh x6, x4, x6; \ - adds x0, x2, x15; \ - adcs x15, x15, x1; \ - adc x1, x1, xzr; \ - cmn x9, #0x1; \ - eor x5, x5, x9; \ - adcs x0, x5, x0; \ - eor x6, x6, x9; \ - adcs x15, x6, x15; \ - adc x1, x1, x9; \ - adds x9, x11, x7; \ - adcs x10, x12, x8; \ - adcs x11, x13, x11; \ - adcs x12, x14, x12; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x2, x2, x16; \ - adcs x9, x2, x9; \ - eor x0, x0, x16; \ - adcs x10, x0, x10; \ - eor x15, x15, x16; \ - adcs x11, x15, x11; \ - eor x1, x1, x16; \ - adcs x12, x1, x12; \ - adcs x13, x13, x16; \ - adc x14, x14, x16; \ - mov x3, #0x26; \ - umull x4, w11, w3; \ - add x4, x4, w7, uxtw; \ - lsr x7, x7, #32; \ - lsr x11, x11, #32; \ - umaddl x11, w11, w3, x7; \ - mov x7, x4; \ - umull x4, w12, w3; \ - add x4, x4, w8, uxtw; \ - lsr x8, x8, #32; \ - lsr x12, x12, #32; \ - umaddl x12, w12, w3, x8; \ - mov x8, x4; \ - umull x4, w13, w3; \ - add x4, x4, w9, uxtw; \ - lsr x9, x9, #32; \ - lsr x13, x13, #32; \ - umaddl x13, w13, w3, x9; \ - mov x9, x4; \ - umull x4, w14, w3; \ - add x4, x4, w10, uxtw; \ - lsr x10, x10, #32; \ - lsr x14, x14, #32; \ - umaddl x14, w14, w3, x10; \ - mov x10, x4; \ - lsr x0, x14, #31; 
\ - mov x5, #0x13; \ - umull x5, w5, w0; \ - add x7, x7, x5; \ - adds x7, x7, x11, lsl #32; \ - extr x3, x12, x11, #32; \ - adcs x8, x8, x3; \ - extr x3, x13, x12, #32; \ - adcs x9, x9, x3; \ - extr x3, x14, x13, #32; \ - lsl x5, x0, #63; \ - eor x10, x10, x5; \ - adc x10, x10, x3; \ - stp x7, x8, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + 
eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umull x5, w5, w0 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + stp x7, x8, [P0] __LF \ stp x9, x10, [P0+16] // Squaring just giving a result < 2 * p_25519, which is done by @@ -471,155 +471,155 @@ // optional correction. #define sqr_4(P0,P1) \ - ldp x10, x11, [P1]; \ - ldp x12, x13, [P1+16]; \ - umull x2, w10, w10; \ - lsr x14, x10, #32; \ - umull x3, w14, w14; \ - umull x14, w10, w14; \ - adds x2, x2, x14, lsl #33; \ - lsr x14, x14, #31; \ - adc x3, x3, x14; \ - umull x4, w11, w11; \ - lsr x14, x11, #32; \ - umull x5, w14, w14; \ - umull x14, w11, w14; \ - mul x15, x10, x11; \ - umulh x16, x10, x11; \ - adds x4, x4, x14, lsl #33; \ - lsr x14, x14, #31; \ - adc x5, x5, x14; \ - adds x15, x15, x15; \ - adcs x16, x16, x16; \ - adc x5, x5, xzr; \ - adds x3, x3, x15; \ - adcs x4, x4, x16; \ - adc x5, x5, xzr; \ - umull x6, w12, w12; \ - lsr x14, x12, #32; \ - umull x7, w14, w14; \ - umull x14, w12, w14; \ - adds x6, x6, x14, lsl #33; \ - lsr x14, x14, #31; \ - adc x7, x7, x14; \ - umull x8, w13, w13; \ - lsr x14, x13, #32; \ - umull x9, w14, w14; \ - umull x14, w13, w14; \ - mul x15, x12, x13; \ - umulh x16, x12, x13; \ - adds x8, x8, x14, lsl #33; \ - lsr x14, x14, #31; \ - adc x9, x9, x14; \ - adds x15, x15, x15; \ - adcs x16, x16, x16; \ - adc x9, x9, xzr; \ - adds x7, x7, x15; \ - adcs x8, x8, x16; \ - adc x9, x9, xzr; \ - subs x10, x10, x12; \ - sbcs x11, x11, x13; \ - csetm x16, cc; \ - eor x10, x10, x16; \ - subs x10, x10, x16; \ - eor x11, x11, x16; \ - sbc x11, x11, x16; \ - adds x6, x6, x4; \ - adcs x7, x7, x5; \ - adcs x8, x8, xzr; \ - adc x9, x9, xzr; \ - umull x12, w10, w10; \ - lsr x5, x10, #32; \ - umull x13, w5, w5; \ - umull x5, w10, w5; \ - adds x12, x12, x5, lsl #33; \ - lsr x5, x5, #31; \ - adc x13, x13, x5; \ - umull x15, w11, w11; \ - lsr x5, x11, #32; \ - umull x14, w5, w5; \ - umull x5, w11, w5; \ - mul x4, x10, x11; \ - umulh x16, x10, x11; \ - adds x15, x15, x5, lsl #33; \ - lsr x5, x5, #31; \ - adc x14, x14, x5; \ - adds x4, x4, x4; \ - adcs x16, x16, x16; \ - adc x14, x14, xzr; \ - adds x13, x13, x4; \ - adcs x15, x15, x16; \ - adc x14, x14, xzr; \ - adds x4, x2, x6; \ - adcs x5, x3, x7; \ - adcs x6, x6, x8; \ - adcs x7, x7, x9; \ - csetm x16, cc; \ - subs x4, x4, x12; \ - sbcs x5, x5, x13; \ - sbcs x6, x6, x15; \ - sbcs x7, x7, x14; \ - adcs x8, x8, x16; \ - adc x9, x9, x16; \ - mov x10, #0x26; \ - umull 
x12, w6, w10; \ - add x12, x12, w2, uxtw; \ - lsr x2, x2, #32; \ - lsr x6, x6, #32; \ - umaddl x6, w6, w10, x2; \ - mov x2, x12; \ - umull x12, w7, w10; \ - add x12, x12, w3, uxtw; \ - lsr x3, x3, #32; \ - lsr x7, x7, #32; \ - umaddl x7, w7, w10, x3; \ - mov x3, x12; \ - umull x12, w8, w10; \ - add x12, x12, w4, uxtw; \ - lsr x4, x4, #32; \ - lsr x8, x8, #32; \ - umaddl x8, w8, w10, x4; \ - mov x4, x12; \ - umull x12, w9, w10; \ - add x12, x12, w5, uxtw; \ - lsr x5, x5, #32; \ - lsr x9, x9, #32; \ - umaddl x9, w9, w10, x5; \ - mov x5, x12; \ - lsr x13, x9, #31; \ - mov x11, #0x13; \ - umull x11, w11, w13; \ - add x2, x2, x11; \ - adds x2, x2, x6, lsl #32; \ - extr x10, x7, x6, #32; \ - adcs x3, x3, x10; \ - extr x10, x8, x7, #32; \ - adcs x4, x4, x10; \ - extr x10, x9, x8, #32; \ - lsl x11, x13, #63; \ - eor x5, x5, x11; \ - adc x5, x5, x10; \ - stp x2, x3, [P0]; \ + ldp x10, x11, [P1] __LF \ + ldp x12, x13, [P1+16] __LF \ + umull x2, w10, w10 __LF \ + lsr x14, x10, #32 __LF \ + umull x3, w14, w14 __LF \ + umull x14, w10, w14 __LF \ + adds x2, x2, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x3, x3, x14 __LF \ + umull x4, w11, w11 __LF \ + lsr x14, x11, #32 __LF \ + umull x5, w14, w14 __LF \ + umull x14, w11, w14 __LF \ + mul x15, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x4, x4, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x5, x5, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x5, x5, xzr __LF \ + adds x3, x3, x15 __LF \ + adcs x4, x4, x16 __LF \ + adc x5, x5, xzr __LF \ + umull x6, w12, w12 __LF \ + lsr x14, x12, #32 __LF \ + umull x7, w14, w14 __LF \ + umull x14, w12, w14 __LF \ + adds x6, x6, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x7, x7, x14 __LF \ + umull x8, w13, w13 __LF \ + lsr x14, x13, #32 __LF \ + umull x9, w14, w14 __LF \ + umull x14, w13, w14 __LF \ + mul x15, x12, x13 __LF \ + umulh x16, x12, x13 __LF \ + adds x8, x8, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x9, x9, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x9, x9, xzr __LF \ + adds x7, x7, x15 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, xzr __LF \ + subs x10, x10, x12 __LF \ + sbcs x11, x11, x13 __LF \ + csetm x16, cc __LF \ + eor x10, x10, x16 __LF \ + subs x10, x10, x16 __LF \ + eor x11, x11, x16 __LF \ + sbc x11, x11, x16 __LF \ + adds x6, x6, x4 __LF \ + adcs x7, x7, x5 __LF \ + adcs x8, x8, xzr __LF \ + adc x9, x9, xzr __LF \ + umull x12, w10, w10 __LF \ + lsr x5, x10, #32 __LF \ + umull x13, w5, w5 __LF \ + umull x5, w10, w5 __LF \ + adds x12, x12, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x13, x13, x5 __LF \ + umull x15, w11, w11 __LF \ + lsr x5, x11, #32 __LF \ + umull x14, w5, w5 __LF \ + umull x5, w11, w5 __LF \ + mul x4, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x15, x15, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x14, x14, x5 __LF \ + adds x4, x4, x4 __LF \ + adcs x16, x16, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x13, x13, x4 __LF \ + adcs x15, x15, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x4, x2, x6 __LF \ + adcs x5, x3, x7 __LF \ + adcs x6, x6, x8 __LF \ + adcs x7, x7, x9 __LF \ + csetm x16, cc __LF \ + subs x4, x4, x12 __LF \ + sbcs x5, x5, x13 __LF \ + sbcs x6, x6, x15 __LF \ + sbcs x7, x7, x14 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, x16 __LF \ + mov x10, #0x26 __LF \ + umull x12, w6, w10 __LF \ + add x12, x12, w2, uxtw __LF \ + lsr x2, x2, #32 __LF \ + lsr x6, x6, #32 __LF \ + umaddl x6, w6, w10, x2 __LF \ + mov x2, x12 __LF \ + umull x12, w7, w10 __LF \ + 
add x12, x12, w3, uxtw __LF \ + lsr x3, x3, #32 __LF \ + lsr x7, x7, #32 __LF \ + umaddl x7, w7, w10, x3 __LF \ + mov x3, x12 __LF \ + umull x12, w8, w10 __LF \ + add x12, x12, w4, uxtw __LF \ + lsr x4, x4, #32 __LF \ + lsr x8, x8, #32 __LF \ + umaddl x8, w8, w10, x4 __LF \ + mov x4, x12 __LF \ + umull x12, w9, w10 __LF \ + add x12, x12, w5, uxtw __LF \ + lsr x5, x5, #32 __LF \ + lsr x9, x9, #32 __LF \ + umaddl x9, w9, w10, x5 __LF \ + mov x5, x12 __LF \ + lsr x13, x9, #31 __LF \ + mov x11, #0x13 __LF \ + umull x11, w11, w13 __LF \ + add x2, x2, x11 __LF \ + adds x2, x2, x6, lsl #32 __LF \ + extr x10, x7, x6, #32 __LF \ + adcs x3, x3, x10 __LF \ + extr x10, x8, x7, #32 __LF \ + adcs x4, x4, x10 __LF \ + extr x10, x9, x8, #32 __LF \ + lsl x11, x13, #63 __LF \ + eor x5, x5, x11 __LF \ + adc x5, x5, x10 __LF \ + stp x2, x3, [P0] __LF \ stp x4, x5, [P0+16] // Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 #define sub_twice4(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - mov x4, #38; \ - csel x3, x4, xzr, lo; \ - subs x5, x5, x3; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbc x8, x8, xzr; \ - stp x5, x6, [P0]; \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ stp x7, x8, [P0+16] // Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. @@ -629,59 +629,59 @@ // at least one of them is reduced double modulo. 
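(Illustrative aside, not part of the patch.) Both sub_twice4 above and the add_twice4/double_twice4 macros that follow lean on the same fact: values are only kept reduced modulo the double modulus 2 * p_25519 = 2^256 - 38, so a borrow out of a 256-bit subtraction can be absorbed by subtracting 38 and a carry out of an addition by adding 38. A minimal C model of the subtraction case, assuming both inputs are already < 2 * p_25519; the helper name is invented and flags are spelled out with __int128 instead of the subs/sbcs chain:

#include <stdint.h>

/* z = (x - y) mod (2^256 - 38), little-endian 64-bit limbs, inputs < 2*p. */
static void sub_twice4_model(uint64_t z[4], const uint64_t x[4],
                             const uint64_t y[4]) {
  unsigned __int128 t;
  uint64_t borrow = 0;
  for (int i = 0; i < 4; i++) {              /* the subs/sbcs chain */
    t = (unsigned __int128)x[i] - y[i] - borrow;
    z[i] = (uint64_t)t;
    borrow = (uint64_t)(t >> 64) & 1;
  }
  /* On borrow the 4 limbs hold x - y + 2^256, and 2^256 == 38 (mod 2*p),
     so subtract 38 once (the csel x3, x4, xzr, lo step in the macro).
     With inputs < 2*p this second subtraction cannot borrow again. */
  t = (unsigned __int128)z[0] - (borrow ? 38 : 0);
  z[0] = (uint64_t)t;
  borrow = (uint64_t)(t >> 64) & 1;
  for (int i = 1; i < 4; i++) {
    t = (unsigned __int128)z[i] - borrow;
    z[i] = (uint64_t)t;
    borrow = (uint64_t)(t >> 64) & 1;
  }
}

add_twice4 and double_twice4, defined next, are the mirror image: a carry out of the 256-bit addition is folded back in by conditionally adding 38.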
#define add_twice4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P2+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] #define double_twice4(P0,P1) \ - ldp x3, x4, [P1]; \ - adds x3, x3, x3; \ - adcs x4, x4, x4; \ - ldp x5, x6, [P1+16]; \ - adcs x5, x5, x5; \ - adcs x6, x6, x6; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + adds x3, x3, x3 __LF \ + adcs x4, x4, x4 __LF \ + ldp x5, x6, [P1+16] __LF \ + adcs x5, x5, x5 __LF \ + adcs x6, x6, x6 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] // Load the constant k_25519 = 2 * d_25519 using immediate operations #define load_k25519(P0) \ - movz x0, #0xf159; \ - movz x1, #0xb156; \ - movz x2, #0xd130; \ - movz x3, #0xfce7; \ - movk x0, #0x26b2, lsl #16; \ - movk x1, #0x8283, lsl #16; \ - movk x2, #0xeef3, lsl #16; \ - movk x3, #0x56df, lsl #16; \ - movk x0, #0x9b94, lsl #32; \ - movk x1, #0x149a, lsl #32; \ - movk x2, #0x80f2, lsl #32; \ - movk x3, #0xd9dc, lsl #32; \ - movk x0, #0xebd6, lsl #48; \ - movk x1, #0x00e0, lsl #48; \ - movk x2, #0x198e, lsl #48; \ - movk x3, #0x2406, lsl #48; \ - stp x0, x1, [P0]; \ + movz x0, #0xf159 __LF \ + movz x1, #0xb156 __LF \ + movz x2, #0xd130 __LF \ + movz x3, #0xfce7 __LF \ + movk x0, #0x26b2, lsl #16 __LF \ + movk x1, #0x8283, lsl #16 __LF \ + movk x2, #0xeef3, lsl #16 __LF \ + movk x3, #0x56df, lsl #16 __LF \ + movk x0, #0x9b94, lsl #32 __LF \ + movk x1, #0x149a, lsl #32 __LF \ + movk x2, #0x80f2, lsl #32 __LF \ + movk x3, #0xd9dc, lsl #32 __LF \ + movk x0, #0xebd6, lsl #48 __LF \ + movk x1, #0x00e0, lsl #48 __LF \ + movk x2, #0x198e, lsl #48 __LF \ + movk x3, #0x2406, lsl #48 __LF \ + stp x0, x1, [P0] __LF \ stp x2, x3, [P0+16] S2N_BN_SYMBOL(edwards25519_scalarmuldouble): diff --git a/third_party/s2n-bignum/arm/curve25519/edwards25519_scalarmuldouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_scalarmuldouble_alt.S similarity index 81% rename from third_party/s2n-bignum/arm/curve25519/edwards25519_scalarmuldouble_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_scalarmuldouble_alt.S index 9c3d6db2cb5..6df13a937bd 100644 --- a/third_party/s2n-bignum/arm/curve25519/edwards25519_scalarmuldouble_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_scalarmuldouble_alt.S @@ -99,213 +99,213 @@ // Load 64-bit immediate into a register #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 // Macro wrapping up the basic field operation 
bignum_mul_p25519_alt, only // trivially different from a pure function call to that subroutine. #define mul_p25519(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - mul x12, x3, x7; \ - umulh x13, x3, x7; \ - mul x11, x3, x8; \ - umulh x14, x3, x8; \ - adds x13, x13, x11; \ - ldp x9, x10, [P2+16]; \ - mul x11, x3, x9; \ - umulh x15, x3, x9; \ - adcs x14, x14, x11; \ - mul x11, x3, x10; \ - umulh x16, x3, x10; \ - adcs x15, x15, x11; \ - adc x16, x16, xzr; \ - ldp x5, x6, [P1+16]; \ - mul x11, x4, x7; \ - adds x13, x13, x11; \ - mul x11, x4, x8; \ - adcs x14, x14, x11; \ - mul x11, x4, x9; \ - adcs x15, x15, x11; \ - mul x11, x4, x10; \ - adcs x16, x16, x11; \ - umulh x3, x4, x10; \ - adc x3, x3, xzr; \ - umulh x11, x4, x7; \ - adds x14, x14, x11; \ - umulh x11, x4, x8; \ - adcs x15, x15, x11; \ - umulh x11, x4, x9; \ - adcs x16, x16, x11; \ - adc x3, x3, xzr; \ - mul x11, x5, x7; \ - adds x14, x14, x11; \ - mul x11, x5, x8; \ - adcs x15, x15, x11; \ - mul x11, x5, x9; \ - adcs x16, x16, x11; \ - mul x11, x5, x10; \ - adcs x3, x3, x11; \ - umulh x4, x5, x10; \ - adc x4, x4, xzr; \ - umulh x11, x5, x7; \ - adds x15, x15, x11; \ - umulh x11, x5, x8; \ - adcs x16, x16, x11; \ - umulh x11, x5, x9; \ - adcs x3, x3, x11; \ - adc x4, x4, xzr; \ - mul x11, x6, x7; \ - adds x15, x15, x11; \ - mul x11, x6, x8; \ - adcs x16, x16, x11; \ - mul x11, x6, x9; \ - adcs x3, x3, x11; \ - mul x11, x6, x10; \ - adcs x4, x4, x11; \ - umulh x5, x6, x10; \ - adc x5, x5, xzr; \ - umulh x11, x6, x7; \ - adds x16, x16, x11; \ - umulh x11, x6, x8; \ - adcs x3, x3, x11; \ - umulh x11, x6, x9; \ - adcs x4, x4, x11; \ - adc x5, x5, xzr; \ - mov x7, #0x26; \ - mul x11, x7, x16; \ - umulh x9, x7, x16; \ - adds x12, x12, x11; \ - mul x11, x7, x3; \ - umulh x3, x7, x3; \ - adcs x13, x13, x11; \ - mul x11, x7, x4; \ - umulh x4, x7, x4; \ - adcs x14, x14, x11; \ - mul x11, x7, x5; \ - umulh x5, x7, x5; \ - adcs x15, x15, x11; \ - cset x16, cs; \ - adds x15, x15, x4; \ - adc x16, x16, x5; \ - cmn x15, x15; \ - orr x15, x15, #0x8000000000000000; \ - adc x8, x16, x16; \ - mov x7, #0x13; \ - madd x11, x7, x8, x7; \ - adds x12, x12, x11; \ - adcs x13, x13, x9; \ - adcs x14, x14, x3; \ - adcs x15, x15, xzr; \ - csel x7, x7, xzr, cc; \ - subs x12, x12, x7; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbc x15, x15, xzr; \ - and x15, x15, #0x7fffffffffffffff; \ - stp x12, x13, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + 
umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + orr x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + madd x11, x7, x8, x7 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x15, x15, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + and x15, x15, #0x7fffffffffffffff __LF \ + stp x12, x13, [P0] __LF \ stp x14, x15, [P0+16] // A version of multiplication that only guarantees output < 2 * p_25519. // This basically skips the +1 and final correction in quotient estimation. #define mul_4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - mul x12, x3, x7; \ - umulh x13, x3, x7; \ - mul x11, x3, x8; \ - umulh x14, x3, x8; \ - adds x13, x13, x11; \ - ldp x9, x10, [P2+16]; \ - mul x11, x3, x9; \ - umulh x15, x3, x9; \ - adcs x14, x14, x11; \ - mul x11, x3, x10; \ - umulh x16, x3, x10; \ - adcs x15, x15, x11; \ - adc x16, x16, xzr; \ - ldp x5, x6, [P1+16]; \ - mul x11, x4, x7; \ - adds x13, x13, x11; \ - mul x11, x4, x8; \ - adcs x14, x14, x11; \ - mul x11, x4, x9; \ - adcs x15, x15, x11; \ - mul x11, x4, x10; \ - adcs x16, x16, x11; \ - umulh x3, x4, x10; \ - adc x3, x3, xzr; \ - umulh x11, x4, x7; \ - adds x14, x14, x11; \ - umulh x11, x4, x8; \ - adcs x15, x15, x11; \ - umulh x11, x4, x9; \ - adcs x16, x16, x11; \ - adc x3, x3, xzr; \ - mul x11, x5, x7; \ - adds x14, x14, x11; \ - mul x11, x5, x8; \ - adcs x15, x15, x11; \ - mul x11, x5, x9; \ - adcs x16, x16, x11; \ - mul x11, x5, x10; \ - adcs x3, x3, x11; \ - umulh x4, x5, x10; \ - adc x4, x4, xzr; \ - umulh x11, x5, x7; \ - adds x15, x15, x11; \ - umulh x11, x5, x8; \ - adcs x16, x16, x11; \ - umulh x11, x5, x9; \ - adcs x3, x3, x11; \ - adc x4, x4, xzr; \ - mul x11, x6, x7; \ - adds x15, x15, x11; \ - mul x11, x6, x8; \ - adcs x16, x16, x11; \ - mul x11, x6, x9; \ - adcs x3, x3, x11; \ - mul x11, x6, x10; \ - adcs x4, x4, x11; \ - umulh x5, x6, x10; \ - adc x5, x5, xzr; \ - umulh x11, x6, x7; \ - adds x16, x16, x11; \ - umulh x11, x6, x8; \ - adcs x3, x3, x11; \ - umulh x11, x6, x9; \ - adcs x4, x4, x11; \ - adc x5, x5, xzr; \ - mov x7, #0x26; \ - mul x11, x7, x16; \ - umulh x9, x7, x16; \ - adds x12, x12, x11; \ - mul x11, x7, x3; \ - umulh x3, x7, x3; \ - adcs x13, x13, x11; \ - mul x11, x7, x4; \ - umulh x4, x7, x4; \ - adcs x14, x14, x11; \ - mul x11, x7, x5; \ - umulh x5, x7, x5; \ - adcs x15, x15, x11; \ - cset x16, cs; \ - adds x15, x15, 
x4; \ - adc x16, x16, x5; \ - cmn x15, x15; \ - bic x15, x15, #0x8000000000000000; \ - adc x8, x16, x16; \ - mov x7, #0x13; \ - mul x11, x7, x8; \ - adds x12, x12, x11; \ - adcs x13, x13, x9; \ - adcs x14, x14, x3; \ - adc x15, x15, xzr; \ - stp x12, x13, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + bic x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + mul x11, x7, x8 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adc x15, x15, xzr __LF \ + stp x12, x13, [P0] __LF \ stp x14, x15, [P0+16] // Squaring just giving a result < 2 * p_25519, which is done by @@ -313,97 +313,97 @@ // optional correction. 
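(Illustrative aside, not part of the patch.) The constants #0x26 (= 38) and #0x13 (= 19) that appear in mul_p25519, mul_4 and sqr_4 all come from p_25519 = 2^255 - 19, which gives 2^255 == 19 and 2^256 == 38 (mod p_25519): anything above bit 255 can be folded back in after multiplying by 19 or 38. A rough C model of the folding phase only, with an invented name and no attempt to reproduce the assembly's carry scheduling or its bit-255 quotient estimate:

#include <stdint.h>

/* Fold an 8-limb (512-bit) product z into 4 limbs congruent mod p_25519,
   using 2^256 == 38 repeatedly.  Little-endian 64-bit limbs. */
static void fold_by_38_model(uint64_t r[4], const uint64_t z[8]) {
  uint64_t t[5];
  unsigned __int128 acc = 0;
  for (int i = 0; i < 4; i++) {              /* t = z_low + 38 * z_high */
    acc += (unsigned __int128)z[i] + (unsigned __int128)38 * z[i + 4];
    t[i] = (uint64_t)acc;
    acc >>= 64;
  }
  t[4] = (uint64_t)acc;                      /* small overflow limb, at most 38 */
  while (t[4] != 0) {                        /* fold the overflow limb as well */
    acc = (unsigned __int128)38 * t[4];
    t[4] = 0;
    for (int i = 0; i < 4; i++) {
      acc += t[i];
      t[i] = (uint64_t)acc;
      acc >>= 64;
    }
    t[4] = (uint64_t)acc;                    /* shrinks to 0 within a round or two */
  }
  for (int i = 0; i < 4; i++) r[i] = t[i];
  /* The result fits in 256 bits and is congruent to z mod p_25519.  The macros
     above go further: mul_4/sqr_4 guarantee < 2*p_25519, and mul_p25519 adds
     the +1 quotient estimate with #0x13 plus a final conditional subtraction
     to land strictly below p_25519. */
}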
#define sqr_4(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x9, x2, x3; \ - umulh x10, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x11, x2, x5; \ - umulh x12, x2, x5; \ - mul x7, x2, x4; \ - umulh x6, x2, x4; \ - adds x10, x10, x7; \ - adcs x11, x11, x6; \ - mul x7, x3, x4; \ - umulh x6, x3, x4; \ - adc x6, x6, xzr; \ - adds x11, x11, x7; \ - mul x13, x4, x5; \ - umulh x14, x4, x5; \ - adcs x12, x12, x6; \ - mul x7, x3, x5; \ - umulh x6, x3, x5; \ - adc x6, x6, xzr; \ - adds x12, x12, x7; \ - adcs x13, x13, x6; \ - adc x14, x14, xzr; \ - adds x9, x9, x9; \ - adcs x10, x10, x10; \ - adcs x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - cset x6, cs; \ - umulh x7, x2, x2; \ - mul x8, x2, x2; \ - adds x9, x9, x7; \ - mul x7, x3, x3; \ - adcs x10, x10, x7; \ - umulh x7, x3, x3; \ - adcs x11, x11, x7; \ - mul x7, x4, x4; \ - adcs x12, x12, x7; \ - umulh x7, x4, x4; \ - adcs x13, x13, x7; \ - mul x7, x5, x5; \ - adcs x14, x14, x7; \ - umulh x7, x5, x5; \ - adc x6, x6, x7; \ - mov x3, #0x26; \ - mul x7, x3, x12; \ - umulh x4, x3, x12; \ - adds x8, x8, x7; \ - mul x7, x3, x13; \ - umulh x13, x3, x13; \ - adcs x9, x9, x7; \ - mul x7, x3, x14; \ - umulh x14, x3, x14; \ - adcs x10, x10, x7; \ - mul x7, x3, x6; \ - umulh x6, x3, x6; \ - adcs x11, x11, x7; \ - cset x12, cs; \ - adds x11, x11, x14; \ - adc x12, x12, x6; \ - cmn x11, x11; \ - bic x11, x11, #0x8000000000000000; \ - adc x2, x12, x12; \ - mov x3, #0x13; \ - mul x7, x3, x2; \ - adds x8, x8, x7; \ - adcs x9, x9, x4; \ - adcs x10, x10, x13; \ - adc x11, x11, xzr; \ - stp x8, x9, [P0]; \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x7, x2, x4 __LF \ + umulh x6, x2, x4 __LF \ + adds x10, x10, x7 __LF \ + adcs x11, x11, x6 __LF \ + mul x7, x3, x4 __LF \ + umulh x6, x3, x4 __LF \ + adc x6, x6, xzr __LF \ + adds x11, x11, x7 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x6 __LF \ + mul x7, x3, x5 __LF \ + umulh x6, x3, x5 __LF \ + adc x6, x6, xzr __LF \ + adds x12, x12, x7 __LF \ + adcs x13, x13, x6 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x6, cs __LF \ + umulh x7, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x7 __LF \ + mul x7, x3, x3 __LF \ + adcs x10, x10, x7 __LF \ + umulh x7, x3, x3 __LF \ + adcs x11, x11, x7 __LF \ + mul x7, x4, x4 __LF \ + adcs x12, x12, x7 __LF \ + umulh x7, x4, x4 __LF \ + adcs x13, x13, x7 __LF \ + mul x7, x5, x5 __LF \ + adcs x14, x14, x7 __LF \ + umulh x7, x5, x5 __LF \ + adc x6, x6, x7 __LF \ + mov x3, #0x26 __LF \ + mul x7, x3, x12 __LF \ + umulh x4, x3, x12 __LF \ + adds x8, x8, x7 __LF \ + mul x7, x3, x13 __LF \ + umulh x13, x3, x13 __LF \ + adcs x9, x9, x7 __LF \ + mul x7, x3, x14 __LF \ + umulh x14, x3, x14 __LF \ + adcs x10, x10, x7 __LF \ + mul x7, x3, x6 __LF \ + umulh x6, x3, x6 __LF \ + adcs x11, x11, x7 __LF \ + cset x12, cs __LF \ + adds x11, x11, x14 __LF \ + adc x12, x12, x6 __LF \ + cmn x11, x11 __LF \ + bic x11, x11, #0x8000000000000000 __LF \ + adc x2, x12, x12 __LF \ + mov x3, #0x13 __LF \ + mul x7, x3, x2 __LF \ + adds x8, x8, x7 __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x13 __LF \ + adc x11, x11, xzr __LF \ + stp x8, x9, [P0] __LF \ stp x10, x11, [P0+16] // Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 #define 
sub_twice4(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - mov x4, #38; \ - csel x3, x4, xzr, lo; \ - subs x5, x5, x3; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbc x8, x8, xzr; \ - stp x5, x6, [P0]; \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ stp x7, x8, [P0+16] // Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. @@ -413,59 +413,59 @@ // at least one of them is reduced double modulo. #define add_twice4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P2+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] #define double_twice4(P0,P1) \ - ldp x3, x4, [P1]; \ - adds x3, x3, x3; \ - adcs x4, x4, x4; \ - ldp x5, x6, [P1+16]; \ - adcs x5, x5, x5; \ - adcs x6, x6, x6; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + adds x3, x3, x3 __LF \ + adcs x4, x4, x4 __LF \ + ldp x5, x6, [P1+16] __LF \ + adcs x5, x5, x5 __LF \ + adcs x6, x6, x6 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] // Load the constant k_25519 = 2 * d_25519 using immediate operations #define load_k25519(P0) \ - movz x0, #0xf159; \ - movz x1, #0xb156; \ - movz x2, #0xd130; \ - movz x3, #0xfce7; \ - movk x0, #0x26b2, lsl #16; \ - movk x1, #0x8283, lsl #16; \ - movk x2, #0xeef3, lsl #16; \ - movk x3, #0x56df, lsl #16; \ - movk x0, #0x9b94, lsl #32; \ - movk x1, #0x149a, lsl #32; \ - movk x2, #0x80f2, lsl #32; \ - movk x3, #0xd9dc, lsl #32; \ - movk x0, #0xebd6, lsl #48; \ - movk x1, #0x00e0, lsl #48; \ - movk x2, #0x198e, lsl #48; \ - movk x3, #0x2406, lsl #48; \ - stp x0, x1, [P0]; \ + movz x0, #0xf159 __LF \ + movz x1, #0xb156 __LF \ + movz x2, #0xd130 __LF \ + movz x3, #0xfce7 __LF \ + movk x0, #0x26b2, lsl #16 __LF \ + movk x1, #0x8283, lsl #16 __LF \ + movk x2, #0xeef3, lsl #16 __LF \ + movk x3, #0x56df, lsl #16 __LF \ + movk x0, #0x9b94, lsl #32 __LF \ + movk x1, #0x149a, lsl #32 __LF \ + movk x2, #0x80f2, lsl #32 __LF \ + movk x3, #0xd9dc, lsl #32 __LF \ + movk x0, #0xebd6, lsl #48 __LF \ + movk x1, #0x00e0, lsl #48 __LF \ + movk x2, #0x198e, lsl #48 __LF \ + movk x3, #0x2406, lsl #48 __LF \ + stp x0, x1, [P0] __LF \ stp x2, x3, [P0+16] S2N_BN_SYMBOL(edwards25519_scalarmuldouble_alt): diff 
--git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/Makefile b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/Makefile new file mode 100644 index 00000000000..10f922e94dc --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/Makefile @@ -0,0 +1,47 @@ +############################################################################# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 +############################################################################# + +# If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise +# use a cross-assembling version so that the code can still be assembled +# and the proofs checked against the object files (though you won't be able +# to run code without additional emulation infrastructure). The aarch64 +# cross-assembling version can be installed manually by something like: +# +# sudo apt-get install binutils-aarch64-linux-gnu + +UNAME_RESULT=$(shell uname -p) + +ifeq ($(UNAME_RESULT),aarch64) +GAS=as +else +GAS=aarch64-linux-gnu-as +endif + +# List of object files + +OBJ = bignum_emontredc_8n.o \ + bignum_emontredc_8n_cdiff.o \ + bignum_kmul_16_32.o \ + bignum_kmul_32_64.o \ + bignum_ksqr_16_32.o \ + bignum_ksqr_32_64.o \ + bignum_mul_4_8.o \ + bignum_mul_4_8_alt.o \ + bignum_mul_6_12.o \ + bignum_mul_6_12_alt.o \ + bignum_mul_8_16.o \ + bignum_mul_8_16_alt.o \ + bignum_sqr_4_8.o \ + bignum_sqr_4_8_alt.o \ + bignum_sqr_6_12.o \ + bignum_sqr_6_12_alt.o \ + bignum_sqr_8_16.o \ + bignum_sqr_8_16_alt.o \ + +%.o : %.S ; $(CC) -E -I../../include $< | $(GAS) -o $@ - + +default: $(OBJ); + +clean:; rm -f *.o *.correct diff --git a/third_party/s2n-bignum/arm/fastmul/bignum_emontredc_8n_neon.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_emontredc_8n.S similarity index 72% rename from third_party/s2n-bignum/arm/fastmul/bignum_emontredc_8n_neon.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_emontredc_8n.S index 3e72ebd67fa..19dc363f13c 100644 --- a/third_party/s2n-bignum/arm/fastmul/bignum_emontredc_8n_neon.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_emontredc_8n.S @@ -5,23 +5,25 @@ // Extended Montgomery reduce in 8-digit blocks, results in input-output buffer // Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] // -// extern uint64_t bignum_emontredc_8n_neon +// extern uint64_t bignum_emontredc_8n // (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w); // // Functionally equivalent to bignum_emontredc (see that file for more detail). // But in general assumes that the input k is a multiple of 8. +// bignum_emontredc_8n is a vectorized version of +// unopt/bignum_emontredc_8n_base. // // Standard ARM ABI: X0 = k, X1 = z, X2 = m, X3 = w, returns X0 // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_emontredc_8n_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_emontredc_8n_neon) - .text - .balign 4 + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_emontredc_8n) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_emontredc_8n) + .text + .balign 4 -S2N_BN_SYMBOL(bignum_emontredc_8n_neon): +S2N_BN_SYMBOL(bignum_emontredc_8n): stp x19, x20, [sp, #-16]! stp x21, x22, [sp, #-16]! stp x23, x24, [sp, #-16]! 
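(Illustrative aside, not part of the patch.) For reference, the word-level Montgomery reduction that bignum_emontredc_8n (and the _cdiff variant added below) vectorizes and unrolls can be modelled in C roughly as follows. z has 2*k words, m has k words, and w plays the role of the usual Montgomery constant -m^{-1} mod 2^64; each step picks a quotient digit q that clears the next low word of z, and the stp x4, x5, [x1] / stp x6, x7, [x1, #16] stores visible above suggest the quotient digits are written back over those cleared low words. The function name and bookkeeping are a sketch, not the routine's exact contract:

#include <stdint.h>

static uint64_t emontredc_model(uint64_t k, uint64_t *z,
                                const uint64_t *m, uint64_t w) {
  uint64_t topcarry = 0;
  for (uint64_t i = 0; i < k; i++) {
    uint64_t q = w * z[i];                   /* quotient digit, mod 2^64 */
    unsigned __int128 c = 0;
    for (uint64_t j = 0; j < k; j++) {       /* z += q * m, shifted by i words */
      c += (unsigned __int128)q * m[j] + z[i + j];
      z[i + j] = (uint64_t)c;
      c >>= 64;
    }
    for (uint64_t j = i + k; c != 0 && j < 2 * k; j++) {   /* propagate carry */
      c += z[j];
      z[j] = (uint64_t)c;
      c >>= 64;
    }
    topcarry |= (uint64_t)c;                 /* at most one extra top bit */
    z[i] = q;                                /* quotient stored in the low half */
  }
  return topcarry;
}

The assembly processes four quotient digits per outer iteration and interleaves the scalar multiply/accumulate chains with NEON 32x32 multiplies, which is why k is assumed to be a multiple of 8.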
@@ -31,14 +33,14 @@ S2N_BN_SYMBOL(bignum_emontredc_8n_neon): lsr x0, x0, #2 mov x26, x0 subs x12, x0, #1 - bcc bignum_emontredc_8n_neon_end + bcc bignum_emontredc_8n_end stp x3, xzr, [sp] stp x26, xzr, [sp, #16] mov x28, xzr lsl x0, x12, #5 -bignum_emontredc_8n_neon_outerloop: +bignum_emontredc_8n_outerloop: ldp x3, xzr, [sp] ldp x17, x19, [x1] ldp x20, x21, [x1, #16] @@ -54,10 +56,10 @@ dup v0.2d, x4 uzp2 v3.4s, v21.4s, v0.4s xtn v4.2s, v0.2d xtn v5.2s, v21.2d - mul x12, x4, x8 + mul x12, x4, x8 adds x17, x17, x12 umulh x12, x4, x8 - mul x13, x4, x9 + mul x13, x4, x9 rev64 v1.4s, v21.4s umull v6.2d, v4.2s, v5.2s umull v7.2d, v4.2s, v3.2s @@ -96,10 +98,10 @@ uzp2 v3.4s, v21.4s, v0.4s xtn v4.2s, v0.2d xtn v5.2s, v21.2d - mul x12, x5, x8 - adds x19, x19, x12 - umulh x12, x5, x8 - mul x13, x5, x9 + mul x12, x5, x8 + adds x19, x19, x12 + umulh x12, x5, x8 + mul x13, x5, x9 rev64 v1.4s, v21.4s umull v6.2d, v4.2s, v5.2s @@ -117,8 +119,8 @@ usra v1.2d, v7.2d, #32 umlal v0.2d, v4.2s, v5.2s mov x14, v0.d[0] mov x15, v0.d[1] - adcs x20, x20, x13 - umulh x13, x5, x9 + adcs x20, x20, x13 + umulh x13, x5, x9 adcs x21, x21, x14 usra v1.2d, v2.2d, #32 mov x14, v1.d[0] @@ -126,7 +128,7 @@ mov x14, v1.d[0] mov x15, v1.d[1] adc x23, xzr, xzr adds x20, x20, x12 - mul x6, x20, x3 // hoisted from step 2 + mul x6, x20, x3 // hoisted from step 2 // NEON: For montgomery step 2, // calculate x6 * (x10, x11) that does two 64x64->128-bit multiplications. @@ -145,7 +147,7 @@ xtn v5.2s, in2.2d stp x4, x5, [x1] -// hoisted from maddloop_neon_firstitr +// hoisted from maddloop_firstitr ldr q20, [x1] // q21 will be loaded later. @@ -159,24 +161,24 @@ umull v6.2d, v4.2s, v5.2s umull v7.2d, v4.2s, v3.2s uzp2 v16.4s, in1.4s, in1.4s - mul x12, x6, x8 - adds x20, x20, x12 + mul x12, x6, x8 + adds x20, x20, x12 mul v0.4s, v1.4s, in1.4s movi v2.2d, #0x000000ffffffff usra v7.2d, v6.2d, #32 umull out_hi.2d, v16.2s, v3.2s - umulh x12, x6, x8 - mul x13, x6, x9 + umulh x12, x6, x8 + mul x13, x6, x9 uaddlp v0.2d, v0.4s and v2.16b, v7.16b, v2.16b umlal v2.2d, v16.2s, v5.2s shl out_lo.2d, v0.2d, #32 - adcs x21, x21, x13 - umulh x13, x6, x9 + adcs x21, x21, x13 + umulh x13, x6, x9 usra out_hi.2d, v7.2d, #32 umlal out_lo.2d, v4.2s, v5.2s @@ -195,19 +197,19 @@ usra out_hi.2d, v2.2d, #32 mov x14, v1.d[0] mov x15, v1.d[1] - adc x24, xzr, xzr - adds x21, x21, x12 - mul x7, x21, x3 - adcs x22, x22, x13 - adcs x23, x23, x14 - adc x24, x24, x15 + adc x24, xzr, xzr + adds x21, x21, x12 + mul x7, x21, x3 + adcs x22, x22, x13 + adcs x23, x23, x14 + adc x24, x24, x15 stp x6, x7, [x1, #16] -// hoisted from maddloop_neon_firstitr +// hoisted from maddloop_firstitr ldr q21, [x1, #16] -// pre-calculate 2mul+2umulhs in maddloop_neon_firstitr +// pre-calculate 2mul+2umulhs in maddloop_firstitr // v25++v24 = hi and lo of (x4 * x8, x5 * x9) #define in1 v20 #define in2 v22 @@ -218,16 +220,16 @@ xtn v4.2s, in1.2d // Montgomery step 3 - mul x12, x7, x8 - mul x13, x7, x9 + mul x12, x7, x8 + mul x13, x7, x9 xtn v5.2s, in2.2d rev64 v1.4s, in2.4s umull v6.2d, v4.2s, v5.2s umull v7.2d, v4.2s, v3.2s - mul x14, x7, x10 - mul x15, x7, x11 + mul x14, x7, x10 + mul x15, x7, x11 uzp2 v16.4s, in1.4s, in1.4s mul v0.4s, v1.4s, in1.4s @@ -238,10 +240,10 @@ uaddlp v0.2d, v0.4s and v2.16b, v7.16b, v2.16b umlal v2.2d, v16.2s, v5.2s - adds x21, x21, x12 - umulh x12, x7, x8 - adcs x22, x22, x13 - umulh x13, x7, x9 + adds x21, x21, x12 + umulh x12, x7, x8 + adcs x22, x22, x13 + umulh x13, x7, x9 shl out_lo.2d, v0.2d, #32 usra out_hi.2d, v7.2d, #32 @@ -252,10 +254,10 @@ usra 
out_hi.2d, v2.2d, #32 #undef out_lo #undef out_hi - adcs x23, x23, x14 - umulh x14, x7, x10 - adcs x24, x24, x15 - umulh x15, x7, x11 + adcs x23, x23, x14 + umulh x14, x7, x10 + adcs x24, x24, x15 + umulh x15, x7, x11 // v27++v26 = hi and lo of (x6 * x10, x7 * x11) #define in1 v21 @@ -267,7 +269,7 @@ xtn v4.2s, in1.2d xtn v5.2s, in2.2d rev64 v1.4s, in2.4s -// hoisted from maddloop_neon_firstitr and maddloop_x0one +// hoisted from maddloop_firstitr and maddloop_x0one ldp x8, x9, [x2, #32] ldp x10, x11, [x2, #48] @@ -276,11 +278,11 @@ umull v7.2d, v4.2s, v3.2s uzp2 v16.4s, in1.4s, in1.4s mul v0.4s, v1.4s, in1.4s - adc x25, xzr, xzr - adds x12, x22, x12 - adcs x13, x23, x13 - adcs x14, x24, x14 - adc x15, x25, x15 + adc x25, xzr, xzr + adds x12, x22, x12 + adcs x13, x23, x13 + adcs x14, x24, x14 + adc x15, x25, x15 movi v2.2d, #0x000000ffffffff usra v7.2d, v6.2d, #32 @@ -297,156 +299,156 @@ usra out_hi.2d, v2.2d, #32 #undef out_lo #undef out_hi - cbz x0, bignum_emontredc_8n_neon_madddone + cbz x0, bignum_emontredc_8n_madddone mov x27, x0 cmp x0, #32 - bne bignum_emontredc_8n_neon_maddloop_neon_firstitr - -bignum_emontredc_8n_neon_maddloop_x0one: - add x2, x2, #0x20 - add x1, x1, #0x20 - mul x17, x4, x8 - mul x22, x5, x9 - mul x23, x6, x10 - mul x24, x7, x11 - umulh x16, x4, x8 - adds x22, x22, x16 - umulh x16, x5, x9 - adcs x23, x23, x16 - umulh x16, x6, x10 - adcs x24, x24, x16 - umulh x16, x7, x11 - adc x25, x16, xzr - ldp x20, x21, [x1] - adds x12, x12, x20 - adcs x13, x13, x21 - ldp x20, x21, [x1, #16] - adcs x14, x14, x20 - adcs x15, x15, x21 - adc x16, xzr, xzr - adds x19, x22, x17 - adcs x22, x23, x22 - adcs x23, x24, x23 - adcs x24, x25, x24 - adc x25, xzr, x25 - adds x20, x22, x17 - adcs x21, x23, x19 - adcs x22, x24, x22 - adcs x23, x25, x23 - adcs x24, xzr, x24 - adc x25, xzr, x25 - adds x17, x17, x12 - adcs x19, x19, x13 - adcs x20, x20, x14 - adcs x21, x21, x15 - adcs x22, x22, x16 - adcs x23, x23, xzr - adcs x24, x24, xzr - adc x25, x25, xzr - subs x15, x6, x7 - cneg x15, x15, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x13, x11, x10 - cneg x13, x13, cc // cc = lo, ul, last - mul x14, x15, x13 - umulh x13, x15, x13 - cinv x12, x12, cc // cc = lo, ul, last - cmn x12, #0x1 - eor x14, x14, x12 - adcs x23, x23, x14 - eor x13, x13, x12 - adcs x24, x24, x13 - adc x25, x25, x12 - subs x15, x4, x5 - cneg x15, x15, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x13, x9, x8 - cneg x13, x13, cc // cc = lo, ul, last - mul x14, x15, x13 - umulh x13, x15, x13 - cinv x12, x12, cc // cc = lo, ul, last - cmn x12, #0x1 - eor x14, x14, x12 - adcs x19, x19, x14 - eor x13, x13, x12 - adcs x20, x20, x13 - adcs x21, x21, x12 - adcs x22, x22, x12 - adcs x23, x23, x12 - adcs x24, x24, x12 - adc x25, x25, x12 - subs x15, x5, x7 - cneg x15, x15, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x13, x11, x9 - cneg x13, x13, cc // cc = lo, ul, last - mul x14, x15, x13 - umulh x13, x15, x13 - cinv x12, x12, cc // cc = lo, ul, last - cmn x12, #0x1 - eor x14, x14, x12 - adcs x22, x22, x14 - eor x13, x13, x12 - adcs x23, x23, x13 - adcs x24, x24, x12 - adc x25, x25, x12 - subs x15, x4, x6 - cneg x15, x15, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x13, x10, x8 - cneg x13, x13, cc // cc = lo, ul, last - mul x14, x15, x13 - umulh x13, x15, x13 - cinv x12, x12, cc // cc = lo, ul, last - cmn x12, #0x1 - eor x14, x14, x12 - adcs x20, x20, x14 - eor x13, x13, x12 - adcs x21, x21, x13 - adcs x22, x22, x12 - adcs x23, x23, x12 - 
adcs x24, x24, x12 - adc x25, x25, x12 - subs x15, x4, x7 - cneg x15, x15, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x13, x11, x8 - cneg x13, x13, cc // cc = lo, ul, last - mul x14, x15, x13 - umulh x13, x15, x13 - cinv x12, x12, cc // cc = lo, ul, last - cmn x12, #0x1 - eor x14, x14, x12 - adcs x21, x21, x14 - eor x13, x13, x12 - adcs x22, x22, x13 - adcs x23, x23, x12 - adcs x24, x24, x12 - adc x25, x25, x12 - subs x15, x5, x6 - cneg x15, x15, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x13, x10, x9 - cneg x13, x13, cc // cc = lo, ul, last - mul x14, x15, x13 - umulh x13, x15, x13 - cinv x12, x12, cc // cc = lo, ul, last - cmn x12, #0x1 - eor x14, x14, x12 - adcs x21, x21, x14 - eor x13, x13, x12 - adcs x22, x22, x13 - adcs x13, x23, x12 - adcs x14, x24, x12 - adc x15, x25, x12 - mov x12, x22 - stp x17, x19, [x1] - stp x20, x21, [x1, #16] - sub x27, x27, #0x20 - b bignum_emontredc_8n_neon_madddone - - -bignum_emontredc_8n_neon_maddloop_neon_firstitr: + bne bignum_emontredc_8n_maddloop_firstitr + +bignum_emontredc_8n_maddloop_x0one: + add x2, x2, #0x20 + add x1, x1, #0x20 + mul x17, x4, x8 + mul x22, x5, x9 + mul x23, x6, x10 + mul x24, x7, x11 + umulh x16, x4, x8 + adds x22, x22, x16 + umulh x16, x5, x9 + adcs x23, x23, x16 + umulh x16, x6, x10 + adcs x24, x24, x16 + umulh x16, x7, x11 + adc x25, x16, xzr + ldp x20, x21, [x1] + adds x12, x12, x20 + adcs x13, x13, x21 + ldp x20, x21, [x1, #16] + adcs x14, x14, x20 + adcs x15, x15, x21 + adc x16, xzr, xzr + adds x19, x22, x17 + adcs x22, x23, x22 + adcs x23, x24, x23 + adcs x24, x25, x24 + adc x25, xzr, x25 + adds x20, x22, x17 + adcs x21, x23, x19 + adcs x22, x24, x22 + adcs x23, x25, x23 + adcs x24, xzr, x24 + adc x25, xzr, x25 + adds x17, x17, x12 + adcs x19, x19, x13 + adcs x20, x20, x14 + adcs x21, x21, x15 + adcs x22, x22, x16 + adcs x23, x23, xzr + adcs x24, x24, xzr + adc x25, x25, xzr + subs x15, x6, x7 + cneg x15, x15, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x13, x11, x10 + cneg x13, x13, cc // cc = lo, ul, last + mul x14, x15, x13 + umulh x13, x15, x13 + cinv x12, x12, cc // cc = lo, ul, last + cmn x12, #0x1 + eor x14, x14, x12 + adcs x23, x23, x14 + eor x13, x13, x12 + adcs x24, x24, x13 + adc x25, x25, x12 + subs x15, x4, x5 + cneg x15, x15, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x13, x9, x8 + cneg x13, x13, cc // cc = lo, ul, last + mul x14, x15, x13 + umulh x13, x15, x13 + cinv x12, x12, cc // cc = lo, ul, last + cmn x12, #0x1 + eor x14, x14, x12 + adcs x19, x19, x14 + eor x13, x13, x12 + adcs x20, x20, x13 + adcs x21, x21, x12 + adcs x22, x22, x12 + adcs x23, x23, x12 + adcs x24, x24, x12 + adc x25, x25, x12 + subs x15, x5, x7 + cneg x15, x15, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x13, x11, x9 + cneg x13, x13, cc // cc = lo, ul, last + mul x14, x15, x13 + umulh x13, x15, x13 + cinv x12, x12, cc // cc = lo, ul, last + cmn x12, #0x1 + eor x14, x14, x12 + adcs x22, x22, x14 + eor x13, x13, x12 + adcs x23, x23, x13 + adcs x24, x24, x12 + adc x25, x25, x12 + subs x15, x4, x6 + cneg x15, x15, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x13, x10, x8 + cneg x13, x13, cc // cc = lo, ul, last + mul x14, x15, x13 + umulh x13, x15, x13 + cinv x12, x12, cc // cc = lo, ul, last + cmn x12, #0x1 + eor x14, x14, x12 + adcs x20, x20, x14 + eor x13, x13, x12 + adcs x21, x21, x13 + adcs x22, x22, x12 + adcs x23, x23, x12 + adcs x24, x24, x12 + adc x25, x25, x12 + subs x15, x4, x7 + 
cneg x15, x15, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x13, x11, x8 + cneg x13, x13, cc // cc = lo, ul, last + mul x14, x15, x13 + umulh x13, x15, x13 + cinv x12, x12, cc // cc = lo, ul, last + cmn x12, #0x1 + eor x14, x14, x12 + adcs x21, x21, x14 + eor x13, x13, x12 + adcs x22, x22, x13 + adcs x23, x23, x12 + adcs x24, x24, x12 + adc x25, x25, x12 + subs x15, x5, x6 + cneg x15, x15, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x13, x10, x9 + cneg x13, x13, cc // cc = lo, ul, last + mul x14, x15, x13 + umulh x13, x15, x13 + cinv x12, x12, cc // cc = lo, ul, last + cmn x12, #0x1 + eor x14, x14, x12 + adcs x21, x21, x14 + eor x13, x13, x12 + adcs x22, x22, x13 + adcs x13, x23, x12 + adcs x14, x24, x12 + adc x15, x25, x12 + mov x12, x22 + stp x17, x19, [x1] + stp x20, x21, [x1, #16] + sub x27, x27, #0x20 + b bignum_emontredc_8n_madddone + + +bignum_emontredc_8n_maddloop_firstitr: mov x16, v25.d[0] //umulh x16,x4,x8 mov x22, v24.d[1] //mul x22, x5, x9 @@ -685,10 +687,10 @@ mov x24, v26.d[1] // lo bits of (x7 * x11) sub x27, x27, #32 cmp x27, #32 - beq bignum_emontredc_8n_neon_maddloop_neon_last + beq bignum_emontredc_8n_maddloop_last -bignum_emontredc_8n_neon_maddloop_neon: +bignum_emontredc_8n_maddloop: ldp x8, x9, [x2, #32] ldp x10, x11, [x2, #48] @@ -918,10 +920,10 @@ mov x24, v26.d[1] // lo bits of (x7 * x11) sub x27, x27, #32 cmp x27, #32 - bne bignum_emontredc_8n_neon_maddloop_neon + bne bignum_emontredc_8n_maddloop -bignum_emontredc_8n_neon_maddloop_neon_last: +bignum_emontredc_8n_maddloop_last: ldp x8, x9, [x2, #32] ldp x10, x11, [x2, #48] @@ -1061,7 +1063,7 @@ mov x17, v24.d[0] // lo bits of (x4 * x8) stp x20,x21,[x1,#16] subs x27, x27, #64 -bignum_emontredc_8n_neon_madddone: +bignum_emontredc_8n_madddone: ldp x17, x19, [x1, #32] ldp x20, x21, [x1, #48] ldp x26, xzr, [sp, #16] @@ -1078,10 +1080,10 @@ bignum_emontredc_8n_neon_madddone: add x1, x1, #32 subs x26, x26, #1 stp x26, xzr, [sp, #16] - bne bignum_emontredc_8n_neon_outerloop + bne bignum_emontredc_8n_outerloop neg x0, x28 -bignum_emontredc_8n_neon_end: +bignum_emontredc_8n_end: add sp, sp, #32 ldp x27, x28, [sp], #16 diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_emontredc_8n_cdiff.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_emontredc_8n_cdiff.S new file mode 100644 index 00000000000..daa52ae40d7 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_emontredc_8n_cdiff.S @@ -0,0 +1,656 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Extend Montgomery reduce in 8-digit blocks, uses an extra storage to +// temporarily cache multiplied differences appearing in ADK. +// Results are stored in input-output buffer (z). +// Inputs z[2*k], m[k], w; +// Outputs function return (extra result bit) and z[2*k] +// Temporary buffer m_precalc[12*(k/4-1)] +// k must be divisible by 8 and not smaller than 16. 
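(Illustrative aside, not part of the patch.) The cached differences are the absolute differences of word pairs of the quotient block and of the modulus block used by the ADK-style multiplication, stored together with their sign masks so the inner loop does not keep recomputing them. The cdiff macro defined a little further down (subs / cneg / csetm) produces exactly such a pair; a C model with an invented name:

#include <stdint.h>

/* Model of  cdiff(t, c, x, y):  t = |x - y|,  c = all-ones if x < y, else 0.
   The (t, c) pair is what gets stored in slots like cache_m10 / cache_a01. */
static inline void cdiff_model(uint64_t *t, uint64_t *c, uint64_t x, uint64_t y) {
  *c = (x < y) ? ~UINT64_C(0) : 0;   /* csetm c, cc  (borrow mask) */
  *t = (x < y) ? (y - x) : (x - y);  /* subs t, x, y ; cneg t, t, cc */
}

When the inner loop later needs a signed cross product of the form (a_i - a_j)*(m_l - m_k), it multiplies the two cached absolute values and fixes the sign by XOR-ing with the combined mask and re-adding its low bit, the cmn #0x1 / eor / adcs pattern seen throughout the surrounding code.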
+// +// extern uint64_t bignum_emontredc_8n_cdiff +// (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w, uint64_t *m_precalc); +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = m, X3 = w, X4 = m_precalc +// returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_emontredc_8n_cdiff) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_emontredc_8n_cdiff) + .text + .balign 4 + +#define count x27 + +// Helper macro for the pre-computations +#define cdiff(t, c, x, y) subs t, x, y; cneg t, t, cc; csetm c, cc + +// Some immediate offsets for cached differences+carry used +// in the inner ADK multiplications +#define cache_a01 (32+0*16) +#define cache_a02 (32+1*16) +#define cache_a03 (32+2*16) +#define cache_a12 (32+3*16) +#define cache_a13 (32+4*16) +#define cache_a23 (32+5*16) +#define cache_m10 (0*16) +#define cache_m20 (1*16) +#define cache_m30 (2*16) +#define cache_m21 (3*16) +#define cache_m31 (4*16) +#define cache_m32 (5*16) + +#define a0 x4 +#define a1 x5 +#define a2 x6 +#define a3 x7 + +// Registers for precalculation +#define vpre00 v30 +#define vpre01 v28 +#define vpre02 v17 +#define vpre10 v18 +#define vpre11 v19 +#define vpre12 v20 + +#define m x2 + +S2N_BN_SYMBOL(bignum_emontredc_8n_cdiff): + + sub sp, sp, #(10*16) + stp x19, x20, [sp, #(9*16)] + stp x21, x22, [sp, #(8*16)] + stp x23, x24, [sp, #(7*16)] + stp x25, x26, [sp, #(6*16)] + stp x27, x28, [sp, #(5*16)] + stp x29, x30, [sp, #(4*16)] + stp d14, d15, [sp, #(3*16)] + stp d12, d13, [sp, #(2*16)] + stp d10, d11, [sp, #(1*16)] + stp d8, d9, [sp, #(0*16)] + + // Leave space for cached differences of words of a in inner loop + sub sp, sp, #(6*16) + + sub sp, sp, #32 + lsr x0, x0, #2 + mov x26, x0 + subs x12, x0, #1 + bcc bignum_emontredc_8n_cdiff_end + + // x30 = buffer holding precomputed ADK carry-differences for modulus + + // + // Start of precomputation + // + // Precompute and cache signed differences of modulus components + // used in the ADK multiplication in the inner loop. + // + + // Number of extra limbs required: + // 6 * (number of limbs / 4 - 1) * 2 = 12 * (number_of_limbs/4 - 1) + // + mov x24, x4 + mov x30, x4 + + // Save modulus pointer + mov x25, m + + mov count, x12 + +bignum_emontredc_8n_cdiff_precomp: + ldp a0, a1, [m, #32]! + ldp a2, a3, [m, #16] + +#define t x28 +#define c x29 + + cdiff(t, c, a1, a0) + stp t, c, [x30, #cache_m10] + cdiff(t, c, a2, a0) + stp t, c, [x30, #cache_m20] + cdiff(t, c, a3, a0) + stp t, c, [x30, #cache_m30] + cdiff(t, c, a2, a1) + stp t, c, [x30, #cache_m21] + cdiff(t, c, a3, a1) + stp t, c, [x30, #cache_m31] + cdiff(t, c, a3, a2) + stp t, c, [x30, #cache_m32] + + add x30, x30, #(6*16) + + subs count, count, #1 + cbnz count, bignum_emontredc_8n_cdiff_precomp + + // Set modulus pointer and buffer pointer back to its original value + mov m, x25 + mov x30, x24 + + // + // End of precomputation + // + + stp x3, x30, [sp] + stp x26, xzr, [sp, #16] + mov x28, xzr + lsl x0, x12, #5 + + movi v29.2d, #0x000000ffffffff + +bignum_emontredc_8n_cdiff_outerloop: + ldp x9, x13, [x1, #0] // .*.................................................................................................................................................................................................................. 
+ ldr x3, [sp] // *................................................................................................................................................................................................................... + lsr x27, x0, #5 // ......................................................................................................................................*............................................................................. + sub x27, x27, #1 // ...................................................................................................................................................................................................................* + ldp x10, x12, [x1, #16] // ..*................................................................................................................................................................................................................. + ldp x4, x15, [x2, #0] // ...*................................................................................................................................................................................................................ + ldr q1, [x2, #16] // .....*.............................................................................................................................................................................................................. + mul x11, x9, x3 // ......*............................................................................................................................................................................................................. + uzp2 v18.4S, v1.4S, v1.4S // ........*........................................................................................................................................................................................................... + dup v27.2D, x11 // .......*............................................................................................................................................................................................................ + xtn v13.2S, v1.2D // ..........*......................................................................................................................................................................................................... + rev64 v9.4S, v1.4S // ...........*........................................................................................................................................................................................................ + mul x7, x11, x4 // ...........................*........................................................................................................................................................................................ + rev64 v2.4S, v1.4S // ....................................................................................*............................................................................................................................... + uzp2 v20.4S, v1.4S, v1.4S // .................................................................................*.................................................................................................................................. 
+ mul v31.4S, v9.4S, v27.4S
+ xtn v14.2S, v1.2D
+ uzp2 v21.4S, v1.4S, v1.4S
+ umulh x22, x11, x15
+ xtn v17.2S, v27.2D
+ adds x19, x9, x7
+ umull v28.2D, v17.2S, v13.2S
+ umull v26.2D, v17.2S, v18.2S
+ uaddlp v8.2D, v31.4S
+ umulh x8, x11, x4
+ shl v7.2D, v8.2D, #32
+ uzp2 v30.4S, v27.4S, v27.4S
+ umlal v7.2D, v17.2S, v13.2S
+ mul x14, x11, x15
+ usra v26.2D, v28.2D, #32
+ umull v12.2D, v30.2S, v18.2S
+ mov x24, v7.d[0]
+ adcs x29, x13, x14
+ and v4.16B, v26.16B, v29.16B
+ mov x17, v7.d[1]
+ rev64 v27.4S, v1.4S
+ adcs x5, x10, x24
+ umlal v4.2D, v30.2S, v13.2S
+ usra v12.2D, v26.2D, #32
+ adcs x14, x12, x17
+ adc x23, xzr, xzr
+ adds x8, x29, x8
+ adcs x7, x5, x22
+ mul x25, x8, x3
+ usra v12.2D, v4.2D, #32
+ dup v8.2D, x25
+ stp x11, x25, [x1, #0]
+ mul x22, x25, x4
+ mov x16, v12.d[1]
+ ldr q16, [x1, #0]
+ mov x21, v12.d[0]
+ mul v31.4S, v27.4S, v8.4S
+ adcs x20, x14, x21
+ xtn v27.2S, v8.2D
+ adc x10, x23, x16
+ subs x14, x11, x25
+ rev64 v17.4S, v16.4S
+ cneg x17, x14, cc
+ csetm x26, cc
+ uaddlp v26.2D, v31.4S
+ mul x6, x25, x15
+ stp x17, x26, [sp, #cache_a01]
+ umull v24.2D, v27.2S, v14.2S
+ uzp2 v30.4S, v16.4S, v16.4S
+ shl v4.2D, v26.2D, #32
+ uzp2 v5.4S, v8.4S, v8.4S
+ umulh x17, x25, x4
+ umlal v4.2D, v27.2S, v14.2S
+ umull v8.2D, v27.2S, v21.2S
+ mov x21, v4.d[0]
+ adds x8, x8, x22
+ mov x12, v4.d[1]
+ ldp x23, x14, [x2, #16]
+ adcs x29, x7, x6
+ umulh x13, x25, x15
+ usra v8.2D, v24.2D, #32
+ ldp x8, x24, [x30, #cache_m20]
+ adcs x9, x20, x21
+ ldr q9, [x2, #32]!
+ xtn v28.2S, v16.2D
+ adcs x19, x10, x12
+ ldr q13, [x2, #16]
+ umull v18.2D, v5.2S, v21.2S
+ adc x7, xzr, xzr
+ adds x5, x29, x17
+ xtn v21.2S, v1.2D
+ mul x12, x5, x3
+ and v4.16B, v8.16B, v29.16B
+ adcs x21, x9, x13
+ uzp2 v31.4S, v9.4S, v9.4S
+ xtn v23.2S, v9.2D
+ usra v18.2D, v8.2D, #32
+ umlal v4.2D, v5.2S, v14.2S
+ dup v5.2D, x12
+ umull v16.2D, v23.2S, v30.2S
+ umull v1.2D, v23.2S, v28.2S
+ umulh x29, x12, x15
+ umull v8.2D, v31.2S, v30.2S
+ xtn v24.2S, v13.2D
+ mul v25.4S, v2.4S, v5.4S
+ usra v18.2D, v4.2D, #32
+ xtn v3.2S, v5.2D
+ uzp2 v19.4S, v5.4S, v5.4S
+ mul x10, x12, x15
+ umull v26.2D, v3.2S, v20.2S
+ mov x22, v18.d[0]
+ umull v10.2D, v3.2S, v21.2S
+ uaddlp v11.2D, v25.4S
+ mov x6, v18.d[1]
+ mul x16, x12, x4
+ umull v4.2D, v19.2S, v20.2S
+ usra v16.2D, v1.2D, #32
+ adcs x13, x19, x22
+ shl v11.2D, v11.2D, #32
+ adc x6, x7, x6
+ subs x7, x11, x12
+ usra v26.2D, v10.2D, #32
+ csetm x26, cc
+ cneg x20, x7, cc
+ subs x19, x25, x12
+ umlal v11.2D, v3.2S, v21.2S
+ cneg x9, x19, cc
+ stp x20, x26, [sp, #cache_a02]
+ umulh x7, x12, x4
+ usra v8.2D, v16.2D, #32
+ mul v7.4S, v17.4S, v9.4S
+ csetm x26, cc
+ adds x19, x5, x16
+ and v1.16B, v16.16B, v29.16B
+ adcs x21, x21, x10
+ stp x9, x26, [sp, #cache_a12]
+ ldp x17, x20, [sp, #cache_a02]
+ usra v4.2D, v26.2D, #32
+ and v18.16B, v26.16B, v29.16B
+ umlal v1.2D, v31.2S, v28.2S
+ mov x22, v11.d[0]
+ mov x16, v11.d[1]
+ umlal v18.2D, v19.2S, v21.2S
+ adcs x19, x13, x22
+ mul x22, x17, x8
+ uaddlp v5.2D, v7.4S
+ adcs x13, x6, x16
+ usra v8.2D, v1.2D, #32
+ adc x9, xzr, xzr
+ adds x5, x21, x7
+ usra v4.2D, v18.2D, #32
+ adcs x6, x19, x29
+ mul x19, x5, x3
+ shl v15.2D, v5.2D, #32
+ mov x3, v8.d[1]
+ umlal v15.2D, v23.2S, v28.2S
+ mov x21, v4.d[0]
+ mul x7, x19, x23
+ stp x12, x19, [x1, #16]
+ mov x10, v4.d[1]
+ ldr q9, [x1, #16]
+ adcs x13, x13, x21
+ mov x21, v15.d[1]
+ mul x16, x19, x4
+ adc x9, x9, x10
+ subs x29, x25, x19
+ csetm x26, cc
+ cneg x10, x29, cc
+ subs x29, x12, x19
+ stp x10, x26, [sp, #cache_a13]
+ uzp2 v18.4S, v9.4S, v9.4S
+ mul x12, x19, x15
+ rev64 v20.4S, v9.4S
+ xtn v19.2S, v9.2D
+ umull v25.2D, v24.2S, v18.2S
+ csetm x26, cc
+ umull v14.2D, v24.2S, v19.2S
+ cneg x29, x29, cc
+ umulh x10, x19, x23
+ adds x25, x5, x16
+ mul v7.4S, v20.4S, v13.4S
+ adcs x12, x6, x12
+ ldp x6, x5, [sp, #cache_a01]
+ mov x16, v8.d[0]
+ adcs x25, x13, x7
+ stp x29, x26, [sp, #cache_a23]
+ usra v25.2D, v14.2D, #32
+ mul x29, x19, x14
+ uzp2 v1.4S, v13.4S, v13.4S
+ uaddlp v7.2D, v7.4S
+ umull v0.2D, v1.2S, v18.2S
+ umulh x13, x19, x4
+ and v10.16B, v25.16B, v29.16B
+ shl v13.2D, v7.2D, #32
+ adcs x4, x9, x29
+ umlal v10.2D, v1.2S, v19.2S
+ adc x9, xzr, xzr
+ subs x29, x11, x19
+ usra v0.2D, v25.2D, #32
+ eor x11, x20, x24
+ umulh x15, x19, x15
+ umlal v13.2D, v24.2S, v19.2S
+ cneg x7, x29, cc
+ ldp x20, x29, [x1, #32]!
+ csetm x26, cc
+ usra v0.2D, v10.2D, #32
+ umulh x19, x19, x14
+ mov x23, v13.d[1]
+ stp x7, x26, [sp, #cache_a03]
+ adds x12, x12, x13
+ adcs x13, x25, x15
+ mov x26, v0.d[0]
+ umulh x8, x17, x8
+ adcs x14, x4, x10
+ mov x17, v13.d[0]
+ adc x15, x9, x19
+ ldp x24, x10, [x30], #96
+
+bignum_emontredc_8n_cdiff_maddloop_neon:
+
+ ldr q14, [x2, #32]!
+ ldr q25, [x2, #16]
+ eor x19, x5, x10
+ adds x25, x21, x16
+ mov x16, v0.d[1]
+ ldp x4, x7, [x1, #16]
+ adcs x21, x17, x3
+ eor x22, x22, x11
+ adcs x23, x23, x26
+ adc x17, x16, xzr
+ adds x16, x12, x20
+ mul x5, x6, x24
+ xtn v21.2S, v14.2D
+ xtn v31.2S, v25.2D
+ adcs x9, x13, x29
+ uzp2 v24.4S, v25.4S, v25.4S
+ mov x29, v15.d[0]
+ adcs x4, x14, x4
+ ldp x10, x13, [sp, #cache_a23]
+ umull v5.2D, v21.2S, v30.2S
+ umulh x20, x6, x24
+ adcs x24, x15, x7
+ ldp x12, x7, [x30, #cache_m32 - 96]
+ umull v16.2D, v31.2S, v18.2S
+ adc x6, xzr, xzr
+ adds x14, x25, x29
+ umull v13.2D, v21.2S, v28.2S
+ uzp2 v10.4S, v14.4S, v14.4S
+ eor x15, x8, x11
+ adcs x25, x21, x25
+ umull v1.2D, v31.2S, v19.2S
+ adcs x8, x23, x21
+ mul v6.4S, v20.4S, v25.4S
+ eor x7, x13, x7
+ adcs x23, x17, x23
+ eor x21, x5, x19
+ adc x13, xzr, x17
+ adds x17, x25, x29
+ umull v0.2D, v24.2S, v18.2S
+ usra v5.2D, v13.2D, #32
+ adcs x5, x8, x14
+ umull v2.2D, v10.2S, v30.2S
+ adcs x25, x23, x25
+ usra v16.2D, v1.2D, #32
+ adcs x8, x13, x8
+ uaddlp v13.2D, v6.4S
+ adcs x23, xzr, x23
+ and v7.16B, v5.16B, v29.16B
+ adc x13, xzr, x13
+ adds x29, x29, x16
+ mul x16, x10, x12
+ usra v2.2D, v5.2D, #32
+ adcs x9, x14, x9
+ and v25.16B, v16.16B, v29.16B
+ adcs x17, x17, x4
+ umlal v7.2D, v10.2S, v28.2S
+ umulh x12, x10, x12
+ adcs x10, x5, x24
+ usra v0.2D, v16.2D, #32
+ eor x5, x16, x7
+ ldp x16, x14, [x30, #cache_m31 - 96]
+ adcs x6, x25, x6
+ shl v16.2D, v13.2D, #32
+ eor x24, x20, x19
+ adcs x4, x8, xzr
+ ldp x20, x25, [sp, #cache_a13]
+ umlal v25.2D, v24.2S, v19.2S
+ adcs x23, x23, xzr
+ usra v2.2D, v7.2D, #32
+ umlal v16.2D, v31.2S, v19.2S
+ adc x8, x13, xzr
+ adds xzr, x7, #1
+ mul v7.4S, v17.4S, v14.4S
+ adcs x4, x4, x5
+ eor x5, x12, x7
+ adcs x23, x23, x5
+ mul x12, x20, x16
+ adc x5, x8, x7
+ adds xzr, x19, #1
+ adcs x21, x9, x21
+ eor x8, x25, x14
+ usra v0.2D, v25.2D, #32
+ adcs x13, x17, x24
+ stp x29, x21, [x1, #0]
+ umulh x20, x20, x16
+ uaddlp v10.2D, v7.4S
+ adcs x17, x10, x19
+ mov x3, v2.d[1]
+ ldp x29, x24, [sp, #cache_a03]
+ adcs x25, x6, x19
+ ldp x6, x21, [x30, #cache_m30 - 96]
+ eor x10, x12, x8
+ adcs x9, x4, x19
+ mov x26, v0.d[0]
+ ldp x4, x16, [x30, #cache_m21 - 96]
+ adcs x12, x23, x19
+ adc x5, x5, x19
+ adds xzr, x8, #1
+ ldp x7, x19, [sp, #cache_a12]
+ adcs x14, x25, x10
+ mul x25, x29, x6
+ eor x20, x20, x8
+ adcs x23, x9, x20
+ ldp x9, x20, [sp, #cache_a02]
+ eor x24, x24, x21
+ adcs x12, x12, x8
+ adc x10, x5, x8
+ adds xzr, x11, #1
+ umulh x5, x29, x6
+ shl v15.2D, v10.2D, #32
+ adcs x8, x13, x22
+ eor x13, x25, x24
+ adcs x29, x17, x15
+ umlal v15.2D, v21.2S, v28.2S
+ adcs x22, x14, x11
+ mov x17, v16.d[0]
+ adcs x21, x23, x11
+ mul x23, x7, x4
+ adcs x14, x12, x11
+ eor x12, x19, x16
+ mov x16, v2.d[0]
+ adc x15, x10, x11
+ adds xzr, x24, #1
+ eor x19, x5, x24
+ adcs x11, x29, x13
+ umulh x29, x7, x4
+ adcs x13, x22, x19
+ ldp x6, x5, [sp, #cache_a01]
+ ldp x7, x25, [x30, #cache_m20]
+ adcs x19, x21, x24
+ mov x21, v15.d[1]
+ eor x22, x23, x12
+ adcs x14, x14, x24
+ mov x23, v16.d[1]
+ adc x15, x15, x24
+ adds xzr, x12, #1
+ ldp x24, x10, [x30], #96
+ adcs x11, x11, x22
+ mul x22, x9, x7 // ..............................................................................................................e...................................... + eor x4, x29, x12 // ...............................................................................................................................................*..... + adcs x4, x13, x4 // ................................................................................................................................................*.... + stp x8, x11, [x1, #16] // ..............................................................................................................................................*...... + adcs x13, x19, x12 // .................................................................................................................................................*... + eor x11, x20, x25 // .............................................................................................................e....................................... + ldp x20, x29, [x1, #32]! // ..........................................e.......................................................................................................... + adcs x14, x14, x12 // ..................................................................................................................................................*.. + adc x15, x15, x12 // ...................................................................................................................................................*. + mov x12, x4 // ....................................................................................................................................................* + umulh x8, x9, x7 // ...............................................................................................................e..................................... + + sub count, count, #1 + cbnz count, bignum_emontredc_8n_cdiff_maddloop_neon +bignum_emontredc_8n_cdiff_inner_loop_postamble: + umulh x19, x6, x24 // ..............*........................................................................................................... + ldp x7, x9, [sp, #cache_a23] // .............*............................................................................................................ + adds x4, x21, x16 // .*........................................................................................................................ + mov x25, v0.d[1] // ..*....................................................................................................................... + eor x5, x5, x10 // *......................................................................................................................... + adcs x17, x17, x3 // ....*..................................................................................................................... + ldp x16, x10, [x1, #16] // ...*...................................................................................................................... + adcs x21, x23, x26 // ......*................................................................................................................... + eor x8, x8, x11 // ...................*...................................................................................................... + adc x23, x25, xzr // .......*.................................................................................................................. 
+ adds x20, x12, x20 // ........*................................................................................................................. + adcs x12, x13, x29 // ..........*............................................................................................................... + mov x25, v15.d[0] // ...........*.............................................................................................................. + adcs x13, x14, x16 // ............*............................................................................................................. + eor x16, x19, x5 // .........................................*................................................................................ + adcs x29, x15, x10 // ...............*.......................................................................................................... + ldp x14, x19, [x30, #cache_m32 - 96] // ................*......................................................................................................... + mul x15, x6, x24 // .........*................................................................................................................ + adc x24, xzr, xzr // .................*........................................................................................................ + adds x6, x4, x25 // ..................*....................................................................................................... + adcs x10, x17, x4 // ....................*..................................................................................................... + eor x4, x22, x11 // .....*.................................................................................................................... + adcs x17, x21, x17 // .....................*.................................................................................................... + eor x22, x9, x19 // ......................*................................................................................................... + adcs x9, x23, x21 // .......................*.................................................................................................. + adc x21, xzr, x23 // .........................*................................................................................................ + adds x23, x10, x25 // ..........................*............................................................................................... + eor x15, x15, x5 // ........................*................................................................................................. + adcs x19, x17, x6 // ...........................*.............................................................................................. + ldp x26, xzr, [sp, #16] // ...........................................................................................................*.............. + sub x2, x2, x0 // .....................................................................................................................*.... + adcs x10, x9, x10 // ............................*............................................................................................. + adcs x17, x21, x17 // .............................*............................................................................................ + adcs x9, xzr, x9 // ..............................*........................................................................................... 
+ adc x21, xzr, x21 // ...............................*.......................................................................................... + adds x25, x25, x20 // ................................*......................................................................................... + mul x20, x7, x14 // .................................*........................................................................................ + adcs x6, x6, x12 // ..................................*....................................................................................... + adcs x23, x23, x13 // ...................................*...................................................................................... + adcs x13, x19, x29 // .....................................*.................................................................................... + umulh x19, x7, x14 // ....................................*..................................................................................... + ldp x14, x29, [x30, #cache_m31 - 96] // .......................................*.................................................................................. + adcs x10, x10, x24 // ........................................*................................................................................. + ldp x12, x7, [sp, #cache_a13] // ...........................................*.............................................................................. + adcs x17, x17, xzr // ..........................................*............................................................................... + adcs x24, x9, xzr // ............................................*............................................................................. + adc x9, x21, xzr // .............................................*............................................................................ + adds xzr, x22, #1 // ..............................................*........................................................................... + eor x21, x20, x22 // ......................................*................................................................................... + adcs x20, x17, x21 // ...............................................*.......................................................................... + eor x17, x19, x22 // ................................................*......................................................................... + mul x19, x12, x14 // ..................................................*....................................................................... + eor x29, x7, x29 // ......................................................*................................................................... + adcs x17, x24, x17 // .................................................*........................................................................ + adc x9, x9, x22 // ...................................................*...................................................................... + adds xzr, x5, #1 // ....................................................*..................................................................... + umulh x21, x12, x14 // .........................................................*................................................................ + ldp x14, x24, [sp, #cache_a03] // ...........................................................*.............................................................. 
+ adcs x6, x6, x15 // .....................................................*.................................................................... + adcs x7, x23, x16 // .......................................................*.................................................................. + ldp x15, x12, [x30, #cache_m30 - 96] // .............................................................*............................................................ + eor x19, x19, x29 // ..............................................................*........................................................... + adcs x13, x13, x5 // ..........................................................*............................................................... + stp x25, x6, [x1, #0] // ........................................................*................................................................. + adcs x16, x10, x5 // ............................................................*............................................................. + ldp x10, x6, [x30, #cache_m21 - 96] // ................................................................*......................................................... + adcs x23, x20, x5 // ...............................................................*.......................................................... + adcs x25, x17, x5 // .................................................................*........................................................ + umulh x30, x14, x15 // .............................................................................*............................................ + ldp x17, x22, [sp, #cache_a12] // ....................................................................*..................................................... + adc x9, x9, x5 // ..................................................................*....................................................... + adds xzr, x29, #1 // ...................................................................*...................................................... + eor x21, x21, x29 // .......................................................................*.................................................. + adcs x16, x16, x19 // .....................................................................*.................................................... + adcs x5, x23, x21 // ........................................................................*................................................. + eor x23, x24, x12 // .........................................................................*................................................ + mul x12, x17, x10 // ...................................................................................*...................................... + adcs x19, x25, x29 // ..........................................................................*............................................... + adc x29, x9, x29 // ...........................................................................*.............................................. + adds xzr, x11, #1 // ............................................................................*............................................. + adcs x7, x7, x4 // ..............................................................................*........................................... + adcs x21, x13, x8 // ................................................................................*......................................... 
+ mul x4, x14, x15 // ......................................................................*................................................... + eor x9, x30, x23 // ........................................................................................*................................. + adcs x30, x16, x11 // .................................................................................*........................................ + adcs x24, x5, x11 // ..................................................................................*....................................... + adcs x16, x19, x11 // ....................................................................................*..................................... + adc x19, x29, x11 // ......................................................................................*................................... + adds xzr, x23, #1 // .......................................................................................*.................................. + eor x8, x4, x23 // ...............................................................................*.......................................... + adcs x4, x21, x8 // .........................................................................................*................................ + umulh x21, x17, x10 // ..........................................................................................*............................... + adcs x8, x30, x9 // ...........................................................................................*.............................. + ldp x10, x30, [x1, #32] // .........................................................................................................*................ + adcs x25, x24, x23 // ............................................................................................*............................. + eor x11, x22, x6 // .....................................................................................*.................................... + adcs x22, x16, x23 // ..............................................................................................*........................... + eor x5, x12, x11 // .............................................................................................*............................ + adc x29, x19, x23 // ...............................................................................................*.......................... + adds xzr, x11, #1 // ................................................................................................*......................... + eor x14, x21, x11 // ..................................................................................................*....................... + adcs x9, x4, x5 // .................................................................................................*........................ + stp x7, x9, [x1, #16] // ....................................................................................................*..................... + adcs x9, x8, x14 // ...................................................................................................*...................... + adcs x19, x25, x11 // .....................................................................................................*.................... + ldp x8, x21, [x1, #48] // ..........................................................................................................*............... 
+ mov x24, x9 // ........................................................................................................*................. + adcs x5, x22, x11 // ......................................................................................................*................... + adc x16, x29, x11 // .......................................................................................................*.................. + adds xzr, x28, x28 // ............................................................................................................*............. + adcs x17, x10, x24 // .............................................................................................................*............ + adcs x14, x30, x19 // ..............................................................................................................*........... + ldr x30, [sp, #8] // .........................................................................................................................* + adcs x8, x8, x5 // ...............................................................................................................*.......... + stp x17, x14, [x1, #32] // ..................................................................................................................*....... + adcs x9, x21, x16 // ................................................................................................................*......... + csetm x28, cs // .................................................................................................................*........ + stp x8, x9, [x1, #48] // ...................................................................................................................*...... + sub x26, x26, #1 // .......................................................................................................................*.. + sub x1, x1, x0 // ....................................................................................................................*..... + stp x26, xzr, [sp, #16] // ........................................................................................................................*. + add x1, x1, #32 // ......................................................................................................................*... 
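For orientation while reading the interleaved block above: the routine's name suggests blockwise (extended) Montgomery reduction, eight limbs per outer iteration, with the 64x64->128 partial products split between the scalar and NEON pipelines, and the trailing dot/asterisk comments appear to be instruction-scheduling annotations that record each instruction's slot in the unscheduled program order. As a reference for the arithmetic only, here is a textbook word-by-word sketch of that reduction. It assumes the usual Montgomery parameter w = -m[0]^(-1) mod 2^64 and is an illustration, not the s2n-bignum interface.

#include <stddef.h>
#include <stdint.h>

// Hedged sketch: reduce a 2k-word value z by a k-word odd modulus m,
// given w = -m[0]^(-1) mod 2^64.  Each outer step picks q so that
// z[i] + q*m[0] == 0 (mod 2^64); the scheduled loop above performs the
// same recurrence eight limbs at a time.
static uint64_t montgomery_reduce_words(uint64_t *z, size_t k,
                                        const uint64_t *m, uint64_t w) {
  uint64_t top = 0;  // carry out of the top word across outer iterations
  for (size_t i = 0; i < k; i++) {
    uint64_t q = z[i] * w;
    unsigned __int128 c = 0;
    for (size_t j = 0; j < k; j++) {
      unsigned __int128 t = (unsigned __int128)q * m[j] + z[i + j] + (uint64_t)c;
      z[i + j] = (uint64_t)t;  // z[i] becomes 0 when j == 0, by choice of q
      c = t >> 64;
    }
    unsigned __int128 t = (unsigned __int128)z[i + k] + (uint64_t)c + top;
    z[i + k] = (uint64_t)t;
    top = (uint64_t)(t >> 64);
  }
  return top;  // reduced value is z[k..2k-1] plus this top carry
}

The returned top carry corresponds, loosely, to the carry mask the epilogue below accumulates in x28 and converts with `neg x0, x28` into the function's return value.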
+ +bignum_emontredc_8n_cdiff_outer_loop_end: + + cbnz x26, bignum_emontredc_8n_cdiff_outerloop + neg x0, x28 + +bignum_emontredc_8n_cdiff_end: + add sp, sp, #32 + add sp, sp, #(6*16) + ldp d8, d9, [sp, #(0*16)] + ldp d10, d11, [sp, #(1*16)] + ldp d12, d13, [sp, #(2*16)] + ldp d14, d15, [sp, #(3*16)] + ldp x29, x30, [sp, #(4*16)] + ldp x27, x28, [sp, #(5*16)] + ldp x25, x26, [sp, #(6*16)] + ldp x23, x24, [sp, #(7*16)] + ldp x21, x22, [sp, #(8*16)] + ldp x19, x20, [sp, #(9*16)] + add sp, sp, #(10*16) + + ret diff --git a/third_party/s2n-bignum/arm/fastmul/bignum_kmul_16_32_neon.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_kmul_16_32.S similarity index 97% rename from third_party/s2n-bignum/arm/fastmul/bignum_kmul_16_32_neon.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_kmul_16_32.S index 70a8311fcb5..b53104bfe6e 100644 --- a/third_party/s2n-bignum/arm/fastmul/bignum_kmul_16_32_neon.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_kmul_16_32.S @@ -5,7 +5,7 @@ // Multiply z := x * y // Inputs x[16], y[16]; output z[32]; temporary buffer t[>=32] // -// extern void bignum_kmul_16_32_neon +// extern void bignum_kmul_16_32 // (uint64_t z[static 32], uint64_t x[static 16], uint64_t y[static 16], // uint64_t t[static 32]) // @@ -16,8 +16,8 @@ // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_kmul_16_32_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_kmul_16_32_neon) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_kmul_16_32) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_kmul_16_32) .text .balign 4 @@ -33,7 +33,7 @@ #define s x29 #define m x19 -S2N_BN_SYMBOL(bignum_kmul_16_32_neon): +S2N_BN_SYMBOL(bignum_kmul_16_32): // Save registers, including return address @@ -53,7 +53,7 @@ S2N_BN_SYMBOL(bignum_kmul_16_32_neon): // Compute L = x_lo * y_lo in bottom half of buffer (size 8 x 8 -> 16) - bl bignum_kmul_16_32_neon_local_mul_8_16 + bl bignum_kmul_16_32_local_mul_8_16 // Compute absolute difference [t..] = |x_lo - x_hi| // and the sign s = sgn(x_lo - x_hi) as a bitmask (all 1s for negative) @@ -102,7 +102,7 @@ S2N_BN_SYMBOL(bignum_kmul_16_32_neon): add x0, z, #128 add x1, x, #64 add x2, y, #64 - bl bignum_kmul_16_32_neon_local_mul_8_16 + bl bignum_kmul_16_32_local_mul_8_16 // Compute the other absolute difference [t+8..] = |y_hi - y_lo| // Collect the combined product sign bitmask (all 1s for negative) in s @@ -199,7 +199,7 @@ S2N_BN_SYMBOL(bignum_kmul_16_32_neon): add x0, t, #128 mov x1, t add x2, t, #64 - bl bignum_kmul_16_32_neon_local_mul_8_16 + bl bignum_kmul_16_32_local_mul_8_16 // Add the interlocking H' and L_bot terms, storing in registers x15..x0 // Intercept the carry at the 8 + 16 = 24 position and store it in x. 
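The hunks above only rename symbols, but the surrounding comments spell out the subtractive Karatsuba scheme the file implements: compute L = x_lo*y_lo and H = x_hi*y_hi, form the sign-magnitude differences |x_lo - x_hi| and |y_hi - y_lo| with a combined sign bitmask, and rebuild the middle term from L, H, and the signed product of the differences. A hedged, toy-sized illustration of that identity in C, with 64-bit operands split into 32-bit halves; `unsigned __int128` is a GCC/Clang extension used only for the demonstration, and none of these names come from the library:

#include <stdint.h>

typedef unsigned __int128 u128;

static u128 karatsuba_2x2(uint64_t x, uint64_t y) {
  uint64_t x0 = (uint32_t)x, x1 = x >> 32;    // x = x1*2^32 + x0
  uint64_t y0 = (uint32_t)y, y1 = y >> 32;    // y = y1*2^32 + y0

  uint64_t L = x0 * y0;                        // x_lo * y_lo
  uint64_t H = x1 * y1;                        // x_hi * y_hi
  uint64_t dx = x0 >= x1 ? x0 - x1 : x1 - x0;  // |x_lo - x_hi|
  uint64_t dy = y1 >= y0 ? y1 - y0 : y0 - y1;  // |y_hi - y_lo|
  int neg = (x0 >= x1) != (y1 >= y0);          // sign of (x_lo-x_hi)*(y_hi-y_lo)

  // Middle term: x_lo*y_hi + x_hi*y_lo = L + H + (x_lo-x_hi)*(y_hi-y_lo)
  u128 mid = (u128)L + H;
  u128 adj = (u128)dx * dy;
  mid = neg ? mid - adj : mid + adj;

  return ((u128)H << 64) + (mid << 32) + L;    // H*2^64 + mid*2^32 + L
}

Checking karatsuba_2x2(x, y) == (u128)x * y over random inputs exercises exactly the identity the comments describe; the assembly applies it to 512-bit halves and carries the sign as an all-ones/all-zeros mask instead of a bool.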
@@ -330,10 +330,10 @@ S2N_BN_SYMBOL(bignum_kmul_16_32_neon): ret // ---------------------------------------------------------------------------- -// Local copy of bignum_mul_8_16_neon without the scratch register save/restore +// Local copy of bignum_mul_8_16 without the scratch register save/restore // ---------------------------------------------------------------------------- -bignum_kmul_16_32_neon_local_mul_8_16: +bignum_kmul_16_32_local_mul_8_16: ldp x3, x4, [x1] ldr q0, [x1] ldp x7, x8, [x2] diff --git a/third_party/s2n-bignum/arm/fastmul/bignum_kmul_32_64_neon.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_kmul_32_64.S similarity index 98% rename from third_party/s2n-bignum/arm/fastmul/bignum_kmul_32_64_neon.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_kmul_32_64.S index fc716cbea84..313bbf020da 100644 --- a/third_party/s2n-bignum/arm/fastmul/bignum_kmul_32_64_neon.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_kmul_32_64.S @@ -5,7 +5,7 @@ // Multiply z := x * y // Inputs x[32], y[32]; output z[64]; temporary buffer t[>=96] // -// extern void bignum_kmul_32_64_neon +// extern void bignum_kmul_32_64 // (uint64_t z[static 64], uint64_t x[static 32], uint64_t y[static 32], // uint64_t t[static 96]) // @@ -16,8 +16,8 @@ // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_kmul_32_64_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_kmul_32_64_neon) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_kmul_32_64) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_kmul_32_64) .text .balign 4 @@ -31,7 +31,7 @@ #define c x16 -S2N_BN_SYMBOL(bignum_kmul_32_64_neon): +S2N_BN_SYMBOL(bignum_kmul_32_64): // Save extra registers and return address, store parameters safely @@ -49,7 +49,7 @@ S2N_BN_SYMBOL(bignum_kmul_32_64_neon): // Compute L = x_lo * y_lo in bottom half of buffer (size 16 x 16 -> 32) - bl bignum_kmul_32_64_neon_local_kmul_16_32 + bl bignum_kmul_32_64_local_kmul_16_32 // Compute H = x_hi * y_hi in top half of buffer (size 16 x 16 -> 32) @@ -57,7 +57,7 @@ S2N_BN_SYMBOL(bignum_kmul_32_64_neon): add x1, x, #8*K add x2, y, #8*K mov x3, t - bl bignum_kmul_32_64_neon_local_kmul_16_32 + bl bignum_kmul_32_64_local_kmul_16_32 // Compute absolute difference [t..] = |x_lo - x_hi| // and the sign x = sgn(x_lo - x_hi) as a bitmask (all 1s for negative) @@ -350,7 +350,7 @@ S2N_BN_SYMBOL(bignum_kmul_32_64_neon): mov x1, t add x2, t, #8*K add x3, t, #32*K - bl bignum_kmul_32_64_neon_local_kmul_16_32 + bl bignum_kmul_32_64_local_kmul_16_32 // Add the interlocking H' and L_bot terms // Intercept the carry at the 3k position and store it in x. @@ -644,11 +644,11 @@ S2N_BN_SYMBOL(bignum_kmul_32_64_neon): ldp x19, x20, [sp], #16 ret -// Local copy of bignum_kmul_16_32_neon, identical to main one except that it +// Local copy of bignum_kmul_16_32, identical to main one except that it // only preserves the key registers we need to be stable in the main code. -// This includes in turn a copy of bignum_mul_8_16_neon. +// This includes in turn a copy of bignum_mul_8_16. -bignum_kmul_32_64_neon_local_kmul_16_32: +bignum_kmul_32_64_local_kmul_16_32: stp x19, x20, [sp, -16]! stp x21, x22, [sp, -16]! stp x23, x30, [sp, -16]! 
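The bignum_kmul_32_64 comments above lean on the same sign-magnitude trick: each difference is stored as an absolute value plus a bitmask that is all ones when the true value is negative, and a product of two such values is re-signed later by XORing its limbs with the combined mask and pushing the +1 of the two's complement through the carry chain (the `adds xzr, sgn, #1` / `adcs` pattern). A single-word model of the idiom, as a hedged illustration rather than library code; the real routines apply it limb by limb with subs/sbcs/csetm:

#include <stdint.h>

// Branch-free absolute difference plus sign mask: returns |a - b| and
// writes an all-ones mask when a < b (i.e. when the subtraction borrowed).
static uint64_t absdiff_and_mask(uint64_t a, uint64_t b, uint64_t *mask) {
  uint64_t d = a - b;
  uint64_t m = (a < b) ? ~(uint64_t)0 : 0;  // csetm-style borrow mask
  *mask = m;
  return (d ^ m) - m;                        // conditional two's-complement negate
}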
@@ -656,7 +656,7 @@ bignum_kmul_32_64_neon_local_kmul_16_32: mov x26, x1 mov x27, x2 mov x28, x3 - bl bignum_kmul_32_64_neon_local_mul_8_16 + bl bignum_kmul_32_64_local_mul_8_16 ldp x10, x11, [x26] ldp x8, x9, [x26, #64] subs x10, x10, x8 @@ -698,7 +698,7 @@ bignum_kmul_32_64_neon_local_kmul_16_32: add x0, x25, #0x80 add x1, x26, #0x40 add x2, x27, #0x40 - bl bignum_kmul_32_64_neon_local_mul_8_16 + bl bignum_kmul_32_64_local_mul_8_16 ldp x10, x11, [x27] ldp x8, x9, [x27, #64] subs x10, x8, x10 @@ -777,7 +777,7 @@ bignum_kmul_32_64_neon_local_kmul_16_32: add x0, x28, #0x80 mov x1, x28 add x2, x28, #0x40 - bl bignum_kmul_32_64_neon_local_mul_8_16 + bl bignum_kmul_32_64_local_mul_8_16 ldp x0, x1, [x25] ldp x16, x17, [x25, #128] adds x0, x0, x16 @@ -883,7 +883,7 @@ bignum_kmul_32_64_neon_local_kmul_16_32: ldp x19, x20, [sp], #16 ret -bignum_kmul_32_64_neon_local_mul_8_16: +bignum_kmul_32_64_local_mul_8_16: ldp x3, x4, [x1] ldr q0, [x1] ldp x7, x8, [x2] diff --git a/third_party/s2n-bignum/arm/fastmul/bignum_ksqr_16_32.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_ksqr_16_32.S similarity index 75% rename from third_party/s2n-bignum/arm/fastmul/bignum_ksqr_16_32.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_ksqr_16_32.S index 7be2ac6c455..14873d9ef7c 100644 --- a/third_party/s2n-bignum/arm/fastmul/bignum_ksqr_16_32.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_ksqr_16_32.S @@ -263,156 +263,245 @@ S2N_BN_SYMBOL(bignum_ksqr_16_32): // ----------------------------------------------------------------------------- bignum_ksqr_16_32_local_sqr_8_16: +// Load registers. ldp x2, x3, [x1] +ldr q20, [x1] ldp x4, x5, [x1, #16] +ldr q21, [x1, #16] ldp x6, x7, [x1, #32] +ldr q22, [x1, #32] ldp x8, x9, [x1, #48] +ldr q23, [x1, #48] +movi v30.2d, #0xffffffff + mul x17, x2, x4 mul x14, x3, x5 + +// Scalar+NEON: square the lower half with a near-clone of bignum_sqr_4_8 +// NEON: prepare 64x64->128 squaring of two 64-bit ints (x2, x3) +ext v1.16b, v20.16b, v20.16b, #8 umulh x20, x2, x4 +shrn v2.2s, v20.2d, #32 subs x21, x2, x3 - cneg x21, x21, cc - csetm x11, cc +zip1 v0.2s, v20.2s, v1.2s + cneg x21, x21, cc // cc = lo, ul, last +umull v5.2d, v2.2s, v2.2s + csetm x11, cc // cc = lo, ul, last +umull v6.2d, v2.2s, v0.2s subs x12, x5, x4 - cneg x12, x12, cc +umull v3.2d, v0.2s, v0.2s + cneg x12, x12, cc // cc = lo, ul, last +mov v1.16b, v6.16b mul x13, x21, x12 +usra v1.2d, v3.2d, #32 umulh x12, x21, x12 - cinv x11, x11, cc +and v4.16b, v1.16b, v30.16b + cinv x11, x11, cc // cc = lo, ul, last +add v4.2d, v4.2d, v6.2d eor x13, x13, x11 +usra v5.2d, v4.2d, #32 eor x12, x12, x11 +sli v3.2d, v4.2d, #32 adds x19, x17, x20 +usra v5.2d, v1.2d, #32 adc x20, x20, xzr + // NEON: prepare 64x64->128 squaring of two 64-bit ints (x4, x5) + ext v1.16b, v21.16b, v21.16b, #8 umulh x21, x3, x5 + shrn v2.2s, v21.2d, #32 adds x19, x19, x14 + zip1 v0.2s, v21.2s, v1.2s adcs x20, x20, x21 adc x21, x21, xzr adds x20, x20, x14 adc x21, x21, xzr cmn x11, #0x1 adcs x19, x19, x13 +mov x13, v3.d[1] // mul x13, x3, x3 adcs x20, x20, x12 +mov x14, v5.d[1] // umulh x14, x3, x3 adc x21, x21, x11 +mov x12, v3.d[0] // mul x12, x2, x2 adds x17, x17, x17 +mov x11, v5.d[0] // umulh x11, x2, x2 adcs x19, x19, x19 + umull v5.2d, v2.2s, v2.2s adcs x20, x20, x20 + umull v6.2d, v2.2s, v0.2s adcs x21, x21, x21 + umull v3.2d, v0.2s, v0.2s adc x10, xzr, xzr - mul x12, x2, x2 - mul x13, x3, x3 + mov v1.16b, v6.16b + mul x15, x2, x3 - umulh x11, x2, x2 - umulh x14, x3, x3 + usra v1.2d, v3.2d, 
#32 umulh x16, x2, x3 + and v4.16b, v1.16b, v30.16b adds x11, x11, x15 + add v4.2d, v4.2d, v6.2d adcs x13, x13, x16 + usra v5.2d, v4.2d, #32 adc x14, x14, xzr + sli v3.2d, v4.2d, #32 adds x11, x11, x15 + usra v5.2d, v1.2d, #32 adcs x13, x13, x16 adc x14, x14, xzr stp x12, x11, [x0] + mov x11, v5.d[0] // umulh x11, x4, x4 adds x17, x17, x13 + mov x13, v3.d[1] // mul x13, x5, x5 adcs x19, x19, x14 + mov x14, v5.d[1] // umulh x14, x5, x5 adcs x20, x20, xzr + mov x12, v3.d[0] // mul x12, x4, x4 adcs x21, x21, xzr +// NEON: prepare muls in the upper half +ext v1.16b, v22.16b, v22.16b, #8 adc x10, x10, xzr +shrn v2.2s, v22.2d, #32 stp x17, x19, [x0, #16] - mul x12, x4, x4 - mul x13, x5, x5 +zip1 v0.2s, v22.2s, v1.2s mul x15, x4, x5 - umulh x11, x4, x4 - umulh x14, x5, x5 +umull v5.2d, v2.2s, v2.2s umulh x16, x4, x5 +umull v6.2d, v2.2s, v0.2s adds x11, x11, x15 +umull v3.2d, v0.2s, v0.2s adcs x13, x13, x16 +mov v1.16b, v6.16b adc x14, x14, xzr +usra v1.2d, v3.2d, #32 adds x11, x11, x15 +and v4.16b, v1.16b, v30.16b adcs x13, x13, x16 +add v4.2d, v4.2d, v6.2d adc x14, x14, xzr +usra v5.2d, v4.2d, #32 adds x12, x12, x20 +sli v3.2d, v4.2d, #32 adcs x11, x11, x21 +usra v5.2d, v1.2d, #32 stp x12, x11, [x0, #32] + // NEON: prepare muls in the upper half + ext v1.16b, v23.16b, v23.16b, #8 adcs x13, x13, x10 + shrn v2.2s, v23.2d, #32 adc x14, x14, xzr + zip1 v0.2s, v23.2s, v1.2s stp x13, x14, [x0, #48] + +// Scalar: square the upper half with a slight variant of the previous block mul x17, x6, x8 + umull v16.2d, v2.2s, v2.2s mul x14, x7, x9 + umull v6.2d, v2.2s, v0.2s umulh x20, x6, x8 + umull v18.2d, v0.2s, v0.2s subs x21, x6, x7 - cneg x21, x21, cc - csetm x11, cc + cneg x21, x21, cc // cc = lo, ul, last + mov v1.16b, v6.16b + csetm x11, cc // cc = lo, ul, last subs x12, x9, x8 - cneg x12, x12, cc + cneg x12, x12, cc // cc = lo, ul, last + usra v1.2d, v18.2d, #32 mul x13, x21, x12 + and v4.16b, v1.16b, v30.16b umulh x12, x21, x12 - cinv x11, x11, cc + add v4.2d, v4.2d, v6.2d + cinv x11, x11, cc // cc = lo, ul, last eor x13, x13, x11 eor x12, x12, x11 + usra v16.2d, v4.2d, #32 adds x19, x17, x20 adc x20, x20, xzr + sli v18.2d, v4.2d, #32 umulh x21, x7, x9 adds x19, x19, x14 adcs x20, x20, x21 adc x21, x21, xzr adds x20, x20, x14 +mov x14, v5.d[1] adc x21, x21, xzr cmn x11, #0x1 adcs x19, x19, x13 +mov x13, v3.d[1] adcs x20, x20, x12 +mov x12, v3.d[0] adc x21, x21, x11 +mov x11, v5.d[0] adds x17, x17, x17 adcs x19, x19, x19 + usra v16.2d, v1.2d, #32 adcs x20, x20, x20 adcs x21, x21, x21 adc x10, xzr, xzr - mul x12, x6, x6 - mul x13, x7, x7 +// NEON: two mul+umulhs for the next stage +uzp2 v17.4s, v21.4s, v23.4s mul x15, x6, x7 - umulh x11, x6, x6 - umulh x14, x7, x7 +xtn v4.2s, v23.2d umulh x16, x6, x7 + mov x22, v16.d[0] adds x11, x11, x15 adcs x13, x13, x16 +xtn v5.2s, v21.2d adc x14, x14, xzr adds x11, x11, x15 +rev64 v1.4s, v21.4s adcs x13, x13, x16 adc x14, x14, xzr stp x12, x11, [x0, #64] adds x17, x17, x13 + mov x13, v18.d[1] adcs x19, x19, x14 + mov x14, v16.d[1] adcs x20, x20, xzr + mov x12, v18.d[0] adcs x21, x21, xzr adc x10, x10, xzr +umull v6.2d, v4.2s, v5.2s stp x17, x19, [x0, #80] - mul x12, x8, x8 - mul x13, x9, x9 +umull v7.2d, v4.2s, v17.2s mul x15, x8, x9 - umulh x11, x8, x8 - umulh x14, x9, x9 +uzp2 v16.4s, v23.4s, v23.4s umulh x16, x8, x9 - adds x11, x11, x15 +mul v0.4s, v1.4s, v23.4s + adds x11, x22, x15 adcs x13, x13, x16 +usra v7.2d, v6.2d, #32 adc x14, x14, xzr adds x11, x11, x15 +umull v1.2d, v16.2s, v17.2s adcs x13, x13, x16 adc x14, x14, xzr +uaddlp v0.2d, v0.4s adds x12, x12, 
x20 adcs x11, x11, x21 +and v2.16b, v7.16b, v30.16b +umlal v2.2d, v16.2s, v5.2s +shl v0.2d, v0.2d, #32 +usra v1.2d, v7.2d, #32 +umlal v0.2d, v4.2s, v5.2s +mov x16, v0.d[1] +mov x15, v0.d[0] +usra v1.2d, v2.2d, #32 +mov x20, v1.d[0] +mov x21, v1.d[1] stp x12, x11, [x0, #96] adcs x13, x13, x10 adc x14, x14, xzr stp x13, x14, [x0, #112] + +// Now get the cross-product in [s7,...,s0] and double it as [c,s7,...,s0] + mul x10, x2, x6 mul x14, x3, x7 - mul x15, x4, x8 - mul x16, x5, x9 umulh x17, x2, x6 adds x14, x14, x17 umulh x17, x3, x7 adcs x15, x15, x17 - umulh x17, x4, x8 - adcs x16, x16, x17 - umulh x17, x5, x9 - adc x17, x17, xzr + adcs x16, x16, x20 + adc x17, x21, xzr adds x11, x14, x10 adcs x14, x15, x14 adcs x15, x16, x15 @@ -425,13 +514,13 @@ bignum_ksqr_16_32_local_sqr_8_16: adcs x16, xzr, x16 adc x17, xzr, x17 subs x22, x4, x5 - cneg x22, x22, cc - csetm x19, cc + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last subs x20, x9, x8 - cneg x20, x20, cc + cneg x20, x20, cc // cc = lo, ul, last mul x21, x22, x20 umulh x20, x22, x20 - cinv x19, x19, cc + cinv x19, x19, cc // cc = lo, ul, last cmn x19, #0x1 eor x21, x21, x19 adcs x15, x15, x21 @@ -439,13 +528,13 @@ bignum_ksqr_16_32_local_sqr_8_16: adcs x16, x16, x20 adc x17, x17, x19 subs x22, x2, x3 - cneg x22, x22, cc - csetm x19, cc + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last subs x20, x7, x6 - cneg x20, x20, cc + cneg x20, x20, cc // cc = lo, ul, last mul x21, x22, x20 umulh x20, x22, x20 - cinv x19, x19, cc + cinv x19, x19, cc // cc = lo, ul, last cmn x19, #0x1 eor x21, x21, x19 adcs x11, x11, x21 @@ -457,13 +546,13 @@ bignum_ksqr_16_32_local_sqr_8_16: adcs x16, x16, x19 adc x17, x17, x19 subs x22, x3, x5 - cneg x22, x22, cc - csetm x19, cc + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last subs x20, x9, x7 - cneg x20, x20, cc + cneg x20, x20, cc // cc = lo, ul, last mul x21, x22, x20 umulh x20, x22, x20 - cinv x19, x19, cc + cinv x19, x19, cc // cc = lo, ul, last cmn x19, #0x1 eor x21, x21, x19 adcs x14, x14, x21 @@ -472,13 +561,13 @@ bignum_ksqr_16_32_local_sqr_8_16: adcs x16, x16, x19 adc x17, x17, x19 subs x22, x2, x4 - cneg x22, x22, cc - csetm x19, cc + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last subs x20, x8, x6 - cneg x20, x20, cc + cneg x20, x20, cc // cc = lo, ul, last mul x21, x22, x20 umulh x20, x22, x20 - cinv x19, x19, cc + cinv x19, x19, cc // cc = lo, ul, last cmn x19, #0x1 eor x21, x21, x19 adcs x12, x12, x21 @@ -489,13 +578,13 @@ bignum_ksqr_16_32_local_sqr_8_16: adcs x16, x16, x19 adc x17, x17, x19 subs x22, x2, x5 - cneg x22, x22, cc - csetm x19, cc + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last subs x20, x9, x6 - cneg x20, x20, cc + cneg x20, x20, cc // cc = lo, ul, last mul x21, x22, x20 umulh x20, x22, x20 - cinv x19, x19, cc + cinv x19, x19, cc // cc = lo, ul, last cmn x19, #0x1 eor x21, x21, x19 adcs x13, x13, x21 @@ -505,13 +594,13 @@ bignum_ksqr_16_32_local_sqr_8_16: adcs x16, x16, x19 adc x17, x17, x19 subs x22, x3, x4 - cneg x22, x22, cc - csetm x19, cc + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last subs x20, x8, x7 - cneg x20, x20, cc + cneg x20, x20, cc // cc = lo, ul, last mul x21, x22, x20 umulh x20, x22, x20 - cinv x19, x19, cc + cinv x19, x19, cc // cc = lo, ul, last cmn x19, #0x1 eor x21, x21, x19 adcs x13, x13, x21 @@ -529,30 +618,39 @@ bignum_ksqr_16_32_local_sqr_8_16: adcs x16, x16, x16 adcs x17, x17, x17 adc x19, 
xzr, xzr + +// Add it back to the buffer + ldp x2, x3, [x0, #32] adds x10, x10, x2 adcs x11, x11, x3 stp x10, x11, [x0, #32] + ldp x2, x3, [x0, #48] adcs x12, x12, x2 adcs x13, x13, x3 stp x12, x13, [x0, #48] + ldp x2, x3, [x0, #64] adcs x14, x14, x2 adcs x15, x15, x3 stp x14, x15, [x0, #64] + ldp x2, x3, [x0, #80] adcs x16, x16, x2 adcs x17, x17, x3 stp x16, x17, [x0, #80] + ldp x2, x3, [x0, #96] adcs x2, x2, x19 adcs x3, x3, xzr stp x2, x3, [x0, #96] + ldp x2, x3, [x0, #112] adcs x2, x2, xzr adc x3, x3, xzr stp x2, x3, [x0, #112] + ret #if defined(__linux__) && defined(__ELF__) diff --git a/third_party/s2n-bignum/arm/fastmul/bignum_ksqr_32_64.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_ksqr_32_64.S similarity index 84% rename from third_party/s2n-bignum/arm/fastmul/bignum_ksqr_32_64.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_ksqr_32_64.S index 659e00a7919..c54e673c672 100644 --- a/third_party/s2n-bignum/arm/fastmul/bignum_ksqr_32_64.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_ksqr_32_64.S @@ -680,156 +680,245 @@ bignum_ksqr_32_64_local_ksqr_16_32: ret bignum_ksqr_32_64_local_sqr_8_16: +// Load registers. ldp x2, x3, [x1] +ldr q20, [x1] ldp x4, x5, [x1, #16] +ldr q21, [x1, #16] ldp x6, x7, [x1, #32] +ldr q22, [x1, #32] ldp x8, x9, [x1, #48] +ldr q23, [x1, #48] +movi v30.2d, #0xffffffff + mul x17, x2, x4 mul x14, x3, x5 + +// Scalar+NEON: square the lower half with a near-clone of bignum_sqr_4_8 +// NEON: prepare 64x64->128 squaring of two 64-bit ints (x2, x3) +ext v1.16b, v20.16b, v20.16b, #8 umulh x20, x2, x4 +shrn v2.2s, v20.2d, #32 subs x21, x2, x3 - cneg x21, x21, cc - csetm x11, cc +zip1 v0.2s, v20.2s, v1.2s + cneg x21, x21, cc // cc = lo, ul, last +umull v5.2d, v2.2s, v2.2s + csetm x11, cc // cc = lo, ul, last +umull v6.2d, v2.2s, v0.2s subs x12, x5, x4 - cneg x12, x12, cc +umull v3.2d, v0.2s, v0.2s + cneg x12, x12, cc // cc = lo, ul, last +mov v1.16b, v6.16b mul x13, x21, x12 +usra v1.2d, v3.2d, #32 umulh x12, x21, x12 - cinv x11, x11, cc +and v4.16b, v1.16b, v30.16b + cinv x11, x11, cc // cc = lo, ul, last +add v4.2d, v4.2d, v6.2d eor x13, x13, x11 +usra v5.2d, v4.2d, #32 eor x12, x12, x11 +sli v3.2d, v4.2d, #32 adds x19, x17, x20 +usra v5.2d, v1.2d, #32 adc x20, x20, xzr + // NEON: prepare 64x64->128 squaring of two 64-bit ints (x4, x5) + ext v1.16b, v21.16b, v21.16b, #8 umulh x21, x3, x5 + shrn v2.2s, v21.2d, #32 adds x19, x19, x14 + zip1 v0.2s, v21.2s, v1.2s adcs x20, x20, x21 adc x21, x21, xzr adds x20, x20, x14 adc x21, x21, xzr cmn x11, #0x1 adcs x19, x19, x13 +mov x13, v3.d[1] // mul x13, x3, x3 adcs x20, x20, x12 +mov x14, v5.d[1] // umulh x14, x3, x3 adc x21, x21, x11 +mov x12, v3.d[0] // mul x12, x2, x2 adds x17, x17, x17 +mov x11, v5.d[0] // umulh x11, x2, x2 adcs x19, x19, x19 + umull v5.2d, v2.2s, v2.2s adcs x20, x20, x20 + umull v6.2d, v2.2s, v0.2s adcs x21, x21, x21 + umull v3.2d, v0.2s, v0.2s adc x10, xzr, xzr - mul x12, x2, x2 - mul x13, x3, x3 + mov v1.16b, v6.16b + mul x15, x2, x3 - umulh x11, x2, x2 - umulh x14, x3, x3 + usra v1.2d, v3.2d, #32 umulh x16, x2, x3 + and v4.16b, v1.16b, v30.16b adds x11, x11, x15 + add v4.2d, v4.2d, v6.2d adcs x13, x13, x16 + usra v5.2d, v4.2d, #32 adc x14, x14, xzr + sli v3.2d, v4.2d, #32 adds x11, x11, x15 + usra v5.2d, v1.2d, #32 adcs x13, x13, x16 adc x14, x14, xzr stp x12, x11, [x0] + mov x11, v5.d[0] // umulh x11, x4, x4 adds x17, x17, x13 + mov x13, v3.d[1] // mul x13, x5, x5 adcs x19, x19, x14 + mov x14, v5.d[1] // umulh x14, x5, x5 adcs x20, 
x20, xzr + mov x12, v3.d[0] // mul x12, x4, x4 adcs x21, x21, xzr +// NEON: prepare muls in the upper half +ext v1.16b, v22.16b, v22.16b, #8 adc x10, x10, xzr +shrn v2.2s, v22.2d, #32 stp x17, x19, [x0, #16] - mul x12, x4, x4 - mul x13, x5, x5 +zip1 v0.2s, v22.2s, v1.2s mul x15, x4, x5 - umulh x11, x4, x4 - umulh x14, x5, x5 +umull v5.2d, v2.2s, v2.2s umulh x16, x4, x5 +umull v6.2d, v2.2s, v0.2s adds x11, x11, x15 +umull v3.2d, v0.2s, v0.2s adcs x13, x13, x16 +mov v1.16b, v6.16b adc x14, x14, xzr +usra v1.2d, v3.2d, #32 adds x11, x11, x15 +and v4.16b, v1.16b, v30.16b adcs x13, x13, x16 +add v4.2d, v4.2d, v6.2d adc x14, x14, xzr +usra v5.2d, v4.2d, #32 adds x12, x12, x20 +sli v3.2d, v4.2d, #32 adcs x11, x11, x21 +usra v5.2d, v1.2d, #32 stp x12, x11, [x0, #32] + // NEON: prepare muls in the upper half + ext v1.16b, v23.16b, v23.16b, #8 adcs x13, x13, x10 + shrn v2.2s, v23.2d, #32 adc x14, x14, xzr + zip1 v0.2s, v23.2s, v1.2s stp x13, x14, [x0, #48] + +// Scalar: square the upper half with a slight variant of the previous block mul x17, x6, x8 + umull v16.2d, v2.2s, v2.2s mul x14, x7, x9 + umull v6.2d, v2.2s, v0.2s umulh x20, x6, x8 + umull v18.2d, v0.2s, v0.2s subs x21, x6, x7 - cneg x21, x21, cc - csetm x11, cc + cneg x21, x21, cc // cc = lo, ul, last + mov v1.16b, v6.16b + csetm x11, cc // cc = lo, ul, last subs x12, x9, x8 - cneg x12, x12, cc + cneg x12, x12, cc // cc = lo, ul, last + usra v1.2d, v18.2d, #32 mul x13, x21, x12 + and v4.16b, v1.16b, v30.16b umulh x12, x21, x12 - cinv x11, x11, cc + add v4.2d, v4.2d, v6.2d + cinv x11, x11, cc // cc = lo, ul, last eor x13, x13, x11 eor x12, x12, x11 + usra v16.2d, v4.2d, #32 adds x19, x17, x20 adc x20, x20, xzr + sli v18.2d, v4.2d, #32 umulh x21, x7, x9 adds x19, x19, x14 adcs x20, x20, x21 adc x21, x21, xzr adds x20, x20, x14 +mov x14, v5.d[1] adc x21, x21, xzr cmn x11, #0x1 adcs x19, x19, x13 +mov x13, v3.d[1] adcs x20, x20, x12 +mov x12, v3.d[0] adc x21, x21, x11 +mov x11, v5.d[0] adds x17, x17, x17 adcs x19, x19, x19 + usra v16.2d, v1.2d, #32 adcs x20, x20, x20 adcs x21, x21, x21 adc x10, xzr, xzr - mul x12, x6, x6 - mul x13, x7, x7 +// NEON: two mul+umulhs for the next stage +uzp2 v17.4s, v21.4s, v23.4s mul x15, x6, x7 - umulh x11, x6, x6 - umulh x14, x7, x7 +xtn v4.2s, v23.2d umulh x16, x6, x7 + mov x22, v16.d[0] adds x11, x11, x15 adcs x13, x13, x16 +xtn v5.2s, v21.2d adc x14, x14, xzr adds x11, x11, x15 +rev64 v1.4s, v21.4s adcs x13, x13, x16 adc x14, x14, xzr stp x12, x11, [x0, #64] adds x17, x17, x13 + mov x13, v18.d[1] adcs x19, x19, x14 + mov x14, v16.d[1] adcs x20, x20, xzr + mov x12, v18.d[0] adcs x21, x21, xzr adc x10, x10, xzr +umull v6.2d, v4.2s, v5.2s stp x17, x19, [x0, #80] - mul x12, x8, x8 - mul x13, x9, x9 +umull v7.2d, v4.2s, v17.2s mul x15, x8, x9 - umulh x11, x8, x8 - umulh x14, x9, x9 +uzp2 v16.4s, v23.4s, v23.4s umulh x16, x8, x9 - adds x11, x11, x15 +mul v0.4s, v1.4s, v23.4s + adds x11, x22, x15 adcs x13, x13, x16 +usra v7.2d, v6.2d, #32 adc x14, x14, xzr adds x11, x11, x15 +umull v1.2d, v16.2s, v17.2s adcs x13, x13, x16 adc x14, x14, xzr +uaddlp v0.2d, v0.4s adds x12, x12, x20 adcs x11, x11, x21 +and v2.16b, v7.16b, v30.16b +umlal v2.2d, v16.2s, v5.2s +shl v0.2d, v0.2d, #32 +usra v1.2d, v7.2d, #32 +umlal v0.2d, v4.2s, v5.2s +mov x16, v0.d[1] +mov x15, v0.d[0] +usra v1.2d, v2.2d, #32 +mov x20, v1.d[0] +mov x21, v1.d[1] stp x12, x11, [x0, #96] adcs x13, x13, x10 adc x14, x14, xzr stp x13, x14, [x0, #112] + +// Now get the cross-product in [s7,...,s0] and double it as [c,s7,...,s0] + mul x10, x2, x6 mul x14, x3, x7 - 
mul x15, x4, x8 - mul x16, x5, x9 umulh x17, x2, x6 adds x14, x14, x17 umulh x17, x3, x7 adcs x15, x15, x17 - umulh x17, x4, x8 - adcs x16, x16, x17 - umulh x17, x5, x9 - adc x17, x17, xzr + adcs x16, x16, x20 + adc x17, x21, xzr adds x11, x14, x10 adcs x14, x15, x14 adcs x15, x16, x15 @@ -842,13 +931,13 @@ bignum_ksqr_32_64_local_sqr_8_16: adcs x16, xzr, x16 adc x17, xzr, x17 subs x22, x4, x5 - cneg x22, x22, cc - csetm x19, cc + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last subs x20, x9, x8 - cneg x20, x20, cc + cneg x20, x20, cc // cc = lo, ul, last mul x21, x22, x20 umulh x20, x22, x20 - cinv x19, x19, cc + cinv x19, x19, cc // cc = lo, ul, last cmn x19, #0x1 eor x21, x21, x19 adcs x15, x15, x21 @@ -856,13 +945,13 @@ bignum_ksqr_32_64_local_sqr_8_16: adcs x16, x16, x20 adc x17, x17, x19 subs x22, x2, x3 - cneg x22, x22, cc - csetm x19, cc + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last subs x20, x7, x6 - cneg x20, x20, cc + cneg x20, x20, cc // cc = lo, ul, last mul x21, x22, x20 umulh x20, x22, x20 - cinv x19, x19, cc + cinv x19, x19, cc // cc = lo, ul, last cmn x19, #0x1 eor x21, x21, x19 adcs x11, x11, x21 @@ -874,13 +963,13 @@ bignum_ksqr_32_64_local_sqr_8_16: adcs x16, x16, x19 adc x17, x17, x19 subs x22, x3, x5 - cneg x22, x22, cc - csetm x19, cc + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last subs x20, x9, x7 - cneg x20, x20, cc + cneg x20, x20, cc // cc = lo, ul, last mul x21, x22, x20 umulh x20, x22, x20 - cinv x19, x19, cc + cinv x19, x19, cc // cc = lo, ul, last cmn x19, #0x1 eor x21, x21, x19 adcs x14, x14, x21 @@ -889,13 +978,13 @@ bignum_ksqr_32_64_local_sqr_8_16: adcs x16, x16, x19 adc x17, x17, x19 subs x22, x2, x4 - cneg x22, x22, cc - csetm x19, cc + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last subs x20, x8, x6 - cneg x20, x20, cc + cneg x20, x20, cc // cc = lo, ul, last mul x21, x22, x20 umulh x20, x22, x20 - cinv x19, x19, cc + cinv x19, x19, cc // cc = lo, ul, last cmn x19, #0x1 eor x21, x21, x19 adcs x12, x12, x21 @@ -906,13 +995,13 @@ bignum_ksqr_32_64_local_sqr_8_16: adcs x16, x16, x19 adc x17, x17, x19 subs x22, x2, x5 - cneg x22, x22, cc - csetm x19, cc + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last subs x20, x9, x6 - cneg x20, x20, cc + cneg x20, x20, cc // cc = lo, ul, last mul x21, x22, x20 umulh x20, x22, x20 - cinv x19, x19, cc + cinv x19, x19, cc // cc = lo, ul, last cmn x19, #0x1 eor x21, x21, x19 adcs x13, x13, x21 @@ -922,13 +1011,13 @@ bignum_ksqr_32_64_local_sqr_8_16: adcs x16, x16, x19 adc x17, x17, x19 subs x22, x3, x4 - cneg x22, x22, cc - csetm x19, cc + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last subs x20, x8, x7 - cneg x20, x20, cc + cneg x20, x20, cc // cc = lo, ul, last mul x21, x22, x20 umulh x20, x22, x20 - cinv x19, x19, cc + cinv x19, x19, cc // cc = lo, ul, last cmn x19, #0x1 eor x21, x21, x19 adcs x13, x13, x21 @@ -946,30 +1035,39 @@ bignum_ksqr_32_64_local_sqr_8_16: adcs x16, x16, x16 adcs x17, x17, x17 adc x19, xzr, xzr + +// Add it back to the buffer + ldp x2, x3, [x0, #32] adds x10, x10, x2 adcs x11, x11, x3 stp x10, x11, [x0, #32] + ldp x2, x3, [x0, #48] adcs x12, x12, x2 adcs x13, x13, x3 stp x12, x13, [x0, #48] + ldp x2, x3, [x0, #64] adcs x14, x14, x2 adcs x15, x15, x3 stp x14, x15, [x0, #64] + ldp x2, x3, [x0, #80] adcs x16, x16, x2 adcs x17, x17, x3 stp x16, x17, [x0, #80] + ldp x2, x3, [x0, #96] adcs x2, x2, x19 adcs x3, x3, xzr stp x2, x3, 
[x0, #96] + ldp x2, x3, [x0, #112] adcs x2, x2, xzr adc x3, x3, xzr stp x2, x3, [x0, #112] + ret #if defined(__linux__) && defined(__ELF__) diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_4_8.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_4_8.S new file mode 100644 index 00000000000..11f57583bf5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_4_8.S @@ -0,0 +1,252 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[4], y[4]; output z[8] +// +// extern void bignum_mul_4_8 +// (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 + +#define a0 x3 +#define a0short w3 +#define a1 x4 +#define b0 x5 +#define b0short w5 +#define b1 x6 + +#define u0 x7 +#define u1 x8 +#define u2 x9 +#define u3 x10 +#define u4 x11 +#define u5 x12 +#define u6 x13 +#define u7 x14 + +#define t x15 + +#define sgn x16 +#define ysgn x17 + +// These are aliases to registers used elsewhere including input pointers. +// By the time they are used this does not conflict with other uses. + +#define m0 y +#define m1 ysgn +#define m2 t +#define m3 x +#define u u2 + +S2N_BN_SYMBOL(bignum_mul_4_8): + +// Multiply the low halves using Karatsuba 2x2->4 to get [u3,u2,u1,u0] +// The zeroth multiplication (only) is done via 32-bit breakdowns + + ldp a0, a1, [x] + ldp b0, b1, [y] + + umull u0, a0short, b0short + lsr x17, a0, #32 + umull x15, w17, b0short + lsr x16, b0, #32 + umull u1, w16, w17 + umull x16, a0short, w16 + adds u0, u0, x15, lsl #32 + lsr x15, x15, #32 + adc u1, u1, x15 + adds u0, u0, x16, lsl #32 + lsr x16, x16, #32 + adc u1, u1, x16 + + mul u2, a1, b1 + umulh u3, a1, b1 + + subs a1, a1, a0 + cneg a1, a1, cc + csetm sgn, cc + + adds u2, u2, u1 + adc u3, u3, xzr + + subs a0, b0, b1 + cneg a0, a0, cc + cinv sgn, sgn, cc + + mul t, a1, a0 + umulh a0, a1, a0 + + adds u1, u0, u2 + adcs u2, u2, u3 + adc u3, u3, xzr + + adds xzr, sgn, #1 + eor t, t, sgn + adcs u1, t, u1 + eor a0, a0, sgn + adcs u2, a0, u2 + adc u3, u3, sgn + +// Multiply the high halves using Karatsuba 2x2->4 to get [u7,u6,u5,u4] +// Again, the zeroth multiplication (only) is done via 32-bit breakdowns + + ldp a0, a1, [x, #16] + ldp b0, b1, [y, #16] + + umull u4, a0short, b0short + lsr x17, a0, #32 + umull x15, w17, b0short + lsr x16, b0, #32 + umull u5, w16, w17 + umull x16, a0short, w16 + adds u4, u4, x15, lsl #32 + lsr x15, x15, #32 + adc u5, u5, x15 + adds u4, u4, x16, lsl #32 + lsr x16, x16, #32 + adc u5, u5, x16 + + mul u6, a1, b1 + umulh u7, a1, b1 + + subs a1, a1, a0 + cneg a1, a1, cc + csetm sgn, cc + + adds u6, u6, u5 + adc u7, u7, xzr + + subs a0, b0, b1 + cneg a0, a0, cc + cinv sgn, sgn, cc + + mul t, a1, a0 + umulh a0, a1, a0 + + adds u5, u4, u6 + adcs u6, u6, u7 + adc u7, u7, xzr + + adds xzr, sgn, #1 + eor t, t, sgn + adcs u5, t, u5 + eor a0, a0, sgn + adcs u6, a0, u6 + adc u7, u7, sgn + +// Compute sgn,[a1,a0] = x_hi - x_lo +// and ysgn,[b1,b0] = y_lo - y_hi +// sign-magnitude differences + + ldp a0, a1, [x, #16] + ldp t, sgn, 
[x] + subs a0, a0, t + sbcs a1, a1, sgn + csetm sgn, cc + + ldp t, ysgn, [y] + subs b0, t, b0 + sbcs b1, ysgn, b1 + csetm ysgn, cc + + eor a0, a0, sgn + subs a0, a0, sgn + eor a1, a1, sgn + sbc a1, a1, sgn + + eor b0, b0, ysgn + subs b0, b0, ysgn + eor b1, b1, ysgn + sbc b1, b1, ysgn + +// Save the correct sign for the sub-product + + eor sgn, ysgn, sgn + +// Add H' = H + L_top, still in [u7,u6,u5,u4] + + adds u4, u4, u2 + adcs u5, u5, u3 + adcs u6, u6, xzr + adc u7, u7, xzr + +// Now compute the mid-product as [m3,m2,m1,m0] + + mul m0, a0, b0 + umulh m1, a0, b0 + mul m2, a1, b1 + umulh m3, a1, b1 + + subs a1, a1, a0 + cneg a1, a1, cc + csetm u, cc + + adds m2, m2, m1 + adc m3, m3, xzr + + subs b1, b0, b1 + cneg b1, b1, cc + cinv u, u, cc + + mul b0, a1, b1 + umulh b1, a1, b1 + + adds m1, m0, m2 + adcs m2, m2, m3 + adc m3, m3, xzr + + adds xzr, u, #1 + eor b0, b0, u + adcs m1, b0, m1 + eor b1, b1, u + adcs m2, b1, m2 + adc m3, m3, u + +// Accumulate the positive mid-terms as [u7,u6,u5,u4,u3,u2] + + adds u2, u4, u0 + adcs u3, u5, u1 + adcs u4, u6, u4 + adcs u5, u7, u5 + adcs u6, u6, xzr + adc u7, u7, xzr + +// Add in the sign-adjusted complex term + + adds xzr, sgn, #1 + eor m0, m0, sgn + adcs u2, m0, u2 + eor m1, m1, sgn + adcs u3, m1, u3 + eor m2, m2, sgn + adcs u4, m2, u4 + eor m3, m3, sgn + adcs u5, m3, u5 + adcs u6, u6, sgn + adc u7, u7, sgn + +// Store back the result + + stp u0, u1, [z] + stp u2, u3, [z, #16] + stp u4, u5, [z, #32] + stp u6, u7, [z, #48] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_4_8_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_4_8_alt.S new file mode 100644 index 00000000000..b082b8011dd --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_4_8_alt.S @@ -0,0 +1,147 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[4], y[4]; output z[8] +// +// extern void bignum_mul_4_8_alt +// (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8_alt) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 + +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define a3 x6 +#define b0 x7 +#define b1 x8 +#define b2 x9 +#define b3 x10 + +#define t x11 + +#define u0 x12 +#define u1 x13 +#define u2 x14 +#define u3 x15 +#define u4 x16 + +// These alias to the input arguments when no longer needed + +#define u5 a0 +#define u6 a1 +#define u7 a2 + +S2N_BN_SYMBOL(bignum_mul_4_8_alt): + +// Load operands and set up row 0 = [u4;...;u0] = a0 * [b3;...;b0] + + ldp a0, a1, [x] + ldp b0, b1, [y] + + mul u0, a0, b0 + umulh u1, a0, b0 + mul t, a0, b1 + umulh u2, a0, b1 + adds u1, u1, t + + ldp b2, b3, [y, #16] + + mul t, a0, b2 + umulh u3, a0, b2 + adcs u2, u2, t + + mul t, a0, b3 + umulh u4, a0, b3 + adcs u3, u3, t + adc u4, u4, xzr + + ldp a2, a3, [x, #16] + +// Row 1 = [u5;...;u0] = [a1;a0] * [b3;...;b0] + + mul t, a1, b0 + adds u1, u1, t + mul t, a1, b1 + adcs u2, u2, t + mul t, a1, b2 + adcs u3, u3, t + mul t, a1, b3 + adcs u4, u4, t + umulh u5, a1, b3 + adc u5, u5, xzr + + umulh t, a1, b0 + adds u2, u2, t + umulh t, a1, b1 + adcs u3, u3, t + umulh t, a1, b2 + adcs u4, u4, t + adc u5, u5, xzr + +// Row 2 = [u6;...;u0] = [a2;a1;a0] * [b3;...;b0] + + mul t, a2, b0 + adds u2, u2, t + mul t, a2, b1 + adcs u3, u3, t + mul t, a2, b2 + adcs u4, u4, t + mul t, a2, b3 + adcs u5, u5, t + umulh u6, a2, b3 + adc u6, u6, xzr + + umulh t, a2, b0 + adds u3, u3, t + umulh t, a2, b1 + adcs u4, u4, t + umulh t, a2, b2 + adcs u5, u5, t + adc u6, u6, xzr + +// Row 3 = [u7;...;u0] = [a3;...a0] * [b3;...;b0] + + mul t, a3, b0 + adds u3, u3, t + mul t, a3, b1 + adcs u4, u4, t + mul t, a3, b2 + adcs u5, u5, t + mul t, a3, b3 + adcs u6, u6, t + umulh u7, a3, b3 + adc u7, u7, xzr + + umulh t, a3, b0 + adds u4, u4, t + umulh t, a3, b1 + adcs u5, u5, t + umulh t, a3, b2 + adcs u6, u6, t + adc u7, u7, xzr + +// Store back final result [a3;...a0] * [b3;...;b0] = a * b + + stp u0, u1, [z] + stp u2, u3, [z, #16] + stp u4, u5, [z, #32] + stp u6, u7, [z, #48] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_6_12.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_6_12.S new file mode 100644 index 00000000000..b32b19102e6 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_6_12.S @@ -0,0 +1,278 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[6], y[6]; output z[12] +// +// extern void bignum_mul_6_12 +// (uint64_t z[static 12], uint64_t x[static 6], uint64_t y[static 6]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Macro computing [c,b,a] := [b,a] + (x - y) * (w - z), adding with carry +// to the [b,a] components but leaving CF aligned with the c term, which is +// a sign bitmask for (x - y) * (w - z). Continued add-with-carry operations +// with [c,...,c] will continue the carry chain correctly starting from +// the c position if desired to add to a longer term of the form [...,b,a]. +// +// c,h,l,t should all be different and t,h should not overlap w,z. +// --------------------------------------------------------------------------- + +.macro muldiffnadd b,a, c,h,l,t, x,y, w,z + subs \t, \x, \y + cneg \t, \t, cc + csetm \c, cc + subs \h, \w, \z + cneg \h, \h, cc + mul \l, \t, \h + umulh \h, \t, \h + cinv \c, \c, cc + adds xzr, \c, #1 + eor \l, \l, \c + adcs \a, \a, \l + eor \h, \h, \c + adcs \b, \b, \h +.endm + +#define z x0 +#define x x1 +#define y x2 + +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define b0 x6 +#define b1 x7 +#define b2 x8 +#define l0 x9 +#define l1 x10 +#define l2 x11 +#define h0 x12 +#define h1 x13 +#define h2 x14 + +#define s1 x15 +#define s2 x16 +#define s3 x17 +#define s4 x19 +#define s5 x9 + +#define c x10 +#define h x11 +#define l x12 +#define t x13 + +#define s0 x20 + +#define u0 x3 +#define u1 x4 +#define u2 x5 +#define u3 x6 +#define u4 x7 +#define u5 x8 + +// These alias c,h,l but it doesn't matter + +#define u6 x10 +#define u7 x11 +#define u8 x12 + +// We recycle the input pointers near the end + +#define s x1 +#define d x2 + +// --------------------------------------------------------------------------- +// Core 3x3->6 ADK multiplication macro +// Does [s5,s4,s3,s2,s1,s0] = [a2,a1,a0] * [b2,b1,b0] +// +// If the input parameter is 1, it also adds in [z+24,z+32,z+40] +// existing contents; if the parameter is 0 it just does the pure multiply +// --------------------------------------------------------------------------- + +.macro mul3 afl + mul s0, a0, b0 + mul l1, a1, b1 + mul l2, a2, b2 + umulh h0, a0, b0 + umulh h1, a1, b1 + umulh h2, a2, b2 + + adds h0, h0, l1 + adcs h1, h1, l2 + adc h2, h2, xzr + + adds s1, h0, s0 + adcs s2, h1, h0 + adcs s3, h2, h1 + adc s4, h2, xzr + + adds s2, s2, s0 + adcs s3, s3, h0 + adcs s4, s4, h1 + adc s5, h2, xzr + +// Optionally add the existing z contents + +.rep \afl + ldr l, [z,#24] + adds s0, s0, l + ldp l, h, [z,#32] + adcs s1, s1, l + adcs s2, s2, h + adcs s3, s3, xzr + adcs s4, s4, xzr + adc s5, s5, xzr +.endr + + muldiffnadd s2,s1, c,h,l, t, a0,a1, b1,b0 + adcs s3, s3, c + adcs s4, s4, c + adc s5, s5, c + + muldiffnadd s3,s2, c,h,l, t, a0,a2, b2,b0 + adcs s4, s4, c + adc s5, s5, c + + muldiffnadd s4,s3, c,h,l, t, a1,a2, b2,b1 + adc s5, s5, c +.endm + +S2N_BN_SYMBOL(bignum_mul_6_12): + +stp x19, x20, [sp, #-16]! 
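With the callee-saved registers stashed, the function body is essentially three passes of the mul3/muldiffnadd machinery defined above. As a reading aid only, here is a plain C model of what a single muldiffnadd step contributes: multiply the two magnitudes, then fold the combined sign back in with the XOR-plus-carry (1s-complement) trick the macro comment describes, leaving an all-ones/all-zeros mask to sign-extend into higher words. The helper name muldiffnadd_model and the __int128 accumulation are our own illustration, not s2n-bignum code.

#include <stdint.h>

typedef unsigned __int128 u128;

/* Model of one muldiffnadd step: [b,a] += (x - y) * (w - z) as a signed
 * value, returning the product's sign as an all-ones/all-zeros mask c.
 * Higher words of a longer accumulator are extended by adding c together
 * with the returned carry, mirroring the "adcs ..., ..., c" chains below. */
static uint64_t muldiffnadd_model(uint64_t *b, uint64_t *a,
                                  uint64_t x, uint64_t y,
                                  uint64_t w, uint64_t z,
                                  unsigned *carry_out) {
    uint64_t ax = x >= y ? x - y : y - x;                 /* |x - y| */
    uint64_t aw = w >= z ? w - z : z - w;                 /* |w - z| */
    uint64_t c = ((x < y) != (w < z)) ? ~(uint64_t)0 : 0; /* sign mask */
    u128 p = (u128)ax * aw;                               /* magnitude product */
    uint64_t l = (uint64_t)p ^ c;                         /* conditional 1s complement */
    uint64_t h = (uint64_t)(p >> 64) ^ c;
    u128 s = (u128)*a + l + (c & 1);                      /* the +1 completes the negation */
    *a = (uint64_t)s;
    s = (u128)*b + h + (uint64_t)(s >> 64);
    *b = (uint64_t)s;
    *carry_out = (unsigned)(s >> 64);
    return c;
}

In mul3, three such steps correct the initial six-word combination, which is the classic ADK (Karatsuba-style) 3x3 multiplier structure.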
+ +// Multiply the low halves using ADK 3x3->6 + + ldp a0, a1, [x1] + ldp b0, b1, [x2] + ldr a2, [x1, #16] + ldr b2, [x2, #16] + + mul3 0 + stp s0, s1, [x0] + stp s2, s3, [x0, #16] + stp s4, s5, [x0, #32] + +// Multiply the high halves using ADK 3x3->6 + + ldp a0, a1, [x1,#24] + ldp b0, b1, [x2,#24] + ldr a2, [x1, #40] + ldr b2, [x2, #40] + + mul3 1 + + stp s0, s1, [x0, #48] + stp s2, s3, [x0, #64] + stp s4, s5, [x0, #80] + +// Compute t,[a2,a1,a0] = x_hi - x_lo +// and s,[b2,b1,b0] = y_lo - y_hi +// sign-magnitude differences + + ldr t, [x1] + subs a0, a0, t + ldr t, [x1,#8] + sbcs a1, a1, t + ldr t, [x1,#16] + sbcs a2, a2, t + csetm t, cc + + ldr s, [x2] + subs b0, s, b0 + ldr s, [x2,#8] + sbcs b1, s, b1 + ldr s, [x2,#16] + sbcs b2, s, b2 + csetm s, cc + + eor a0, a0, t + subs a0, a0, t + eor a1, a1, t + sbcs a1, a1, t + eor a2, a2, t + sbc a2, a2, t + + eor b0, b0, s + subs b0, b0, s + eor b1, b1, s + sbcs b1, b1, s + eor b2, b2, s + sbc b2, b2, s + +// Save the correct sign for the sub-product + + eor s, s, t + +// Now yet another 3x3->6 ADK core, but not writing back, keeping s0..s5 + + mul3 0 + +// Now accumulate the positive mid-terms as [u5,u4,u3,u2,u1,u0] + + ldp u0, u1, [z] + ldp u3, u4, [z,#48] + adds u0, u0, u3 + adcs u1, u1, u4 + ldr u2, [z,#16] + ldp u5, u6, [z,#64] + adcs u2, u2, u5 + adcs u3, u3, u6 + ldp u7, u8, [z,#80] + adcs u4, u4, u7 + adcs u5, u5, u8 + +// Stop the carry here so we can reintroduce it, taking into account the +// effective addition of s from sign-extension below. Note that we get +// a duplicated word c+carry beyond the first one, so this upper part is +// of the form [d,d,t]. + + adcs t, s, xzr + adc d, s, xzr + +// Add in the sign-adjusted complex term + + adds xzr, s, #1 + eor s0, s0, s + adcs u0, s0, u0 + eor s1, s1, s + adcs u1, s1, u1 + eor s2, s2, s + adcs u2, s2, u2 + eor s3, s3, s + adcs u3, s3, u3 + eor s4, s4, s + adcs u4, s4, u4 + eor s5, s5, s + adcs u5, s5, u5 + adcs u6, u6, t + adcs u7, u7, d + adc u8, u8, d + +// Store it back + + str u0, [x0,#24] + stp u1, u2, [x0,#32] + stp u3, u4, [x0,#48] + stp u5, u6, [x0,#64] + stp u7, u8, [x0,#80] + +// Restore regs and return + + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_6_12_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_6_12_alt.S new file mode 100644 index 00000000000..72dfd7fcd69 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_6_12_alt.S @@ -0,0 +1,264 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[6], y[6]; output z[12] +// +// extern void bignum_mul_6_12_alt +// (uint64_t z[static 12], uint64_t x[static 6], uint64_t y[static 6]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12_alt) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 + +// These are repeated mod 2 as we load pairs of inputs + +#define a0 x3 +#define a1 x4 +#define a2 x3 +#define a3 x4 +#define a4 x3 +#define a5 x4 + +#define b0 x5 +#define b1 x6 +#define b2 x7 +#define b3 x8 +#define b4 x9 +#define b5 x10 + +#define t x11 + +// These repeat mod 8 as we write back + +#define u0 x12 +#define u1 x13 +#define u2 x14 +#define u3 x15 +#define u4 x16 +#define u5 x17 +#define u6 x19 +#define u7 x20 +#define u8 x12 +#define u9 x13 +#define u10 x14 +#define u11 x15 + +S2N_BN_SYMBOL(bignum_mul_6_12_alt): + +// Save more registers + + stp x19, x20, [sp, #-16]! + +// Load operands and set up row 0 = [u6;...;u0] = a0 * [b5;...;b0] + + ldp a0, a1, [x] + ldp b0, b1, [y] + + mul u0, a0, b0 + umulh u1, a0, b0 + mul t, a0, b1 + umulh u2, a0, b1 + adds u1, u1, t + + ldp b2, b3, [y, #16] + + mul t, a0, b2 + umulh u3, a0, b2 + adcs u2, u2, t + + mul t, a0, b3 + umulh u4, a0, b3 + adcs u3, u3, t + + ldp b4, b5, [y, #32] + + mul t, a0, b4 + umulh u5, a0, b4 + adcs u4, u4, t + + mul t, a0, b5 + umulh u6, a0, b5 + adcs u5, u5, t + + adc u6, u6, xzr + +// Row 1 = [u7;...;u0] = [a1;a0] * [b5;...;b0] + + mul t, a1, b0 + adds u1, u1, t + mul t, a1, b1 + adcs u2, u2, t + mul t, a1, b2 + adcs u3, u3, t + mul t, a1, b3 + adcs u4, u4, t + mul t, a1, b4 + adcs u5, u5, t + mul t, a1, b5 + adcs u6, u6, t + cset u7, cs + + umulh t, a1, b0 + adds u2, u2, t + umulh t, a1, b1 + adcs u3, u3, t + umulh t, a1, b2 + adcs u4, u4, t + umulh t, a1, b3 + adcs u5, u5, t + umulh t, a1, b4 + adcs u6, u6, t + umulh t, a1, b5 + adc u7, u7, t + + stp u0, u1, [z] + +// Row 2 = [u8;...;u0] = [a2;a1;a0] * [b5;...;b0] + + ldp a2, a3, [x, #16] + + mul t, a2, b0 + adds u2, u2, t + mul t, a2, b1 + adcs u3, u3, t + mul t, a2, b2 + adcs u4, u4, t + mul t, a2, b3 + adcs u5, u5, t + mul t, a2, b4 + adcs u6, u6, t + mul t, a2, b5 + adcs u7, u7, t + cset u8, cs + + umulh t, a2, b0 + adds u3, u3, t + umulh t, a2, b1 + adcs u4, u4, t + umulh t, a2, b2 + adcs u5, u5, t + umulh t, a2, b3 + adcs u6, u6, t + umulh t, a2, b4 + adcs u7, u7, t + umulh t, a2, b5 + adc u8, u8, t + +// Row 3 = [u9;...;u0] = [a3;a2;a1;a0] * [b5;...;b0] + + mul t, a3, b0 + adds u3, u3, t + mul t, a3, b1 + adcs u4, u4, t + mul t, a3, b2 + adcs u5, u5, t + mul t, a3, b3 + adcs u6, u6, t + mul t, a3, b4 + adcs u7, u7, t + mul t, a3, b5 + adcs u8, u8, t + cset u9, cs + + umulh t, a3, b0 + adds u4, u4, t + umulh t, a3, b1 + adcs u5, u5, t + umulh t, a3, b2 + adcs u6, u6, t + umulh t, a3, b3 + adcs u7, u7, t + umulh t, a3, b4 + adcs u8, u8, t + umulh t, a3, b5 + adc u9, u9, t + + stp u2, u3, [z, #16] + +// Row 4 = [u10;...;u0] = [a4;a3;a2;a1;a0] * [b5;...;b0] + + ldp a4, a5, [x, #32] + + mul t, a4, b0 + adds u4, u4, t + mul t, a4, b1 + adcs u5, u5, t + mul t, a4, b2 + adcs u6, u6, t + mul t, a4, b3 + adcs u7, u7, t + mul t, a4, b4 + adcs u8, u8, t + mul t, a4, b5 + adcs u9, u9, t + cset u10, cs + + umulh t, a4, b0 + 
adds u5, u5, t + umulh t, a4, b1 + adcs u6, u6, t + umulh t, a4, b2 + adcs u7, u7, t + umulh t, a4, b3 + adcs u8, u8, t + umulh t, a4, b4 + adcs u9, u9, t + umulh t, a4, b5 + adc u10, u10, t + +// Row 5 = [u11;...;u0] = [a5;a4;a3;a2;a1;a0] * [b5;...;b0] + + mul t, a5, b0 + adds u5, u5, t + mul t, a5, b1 + adcs u6, u6, t + mul t, a5, b2 + adcs u7, u7, t + mul t, a5, b3 + adcs u8, u8, t + mul t, a5, b4 + adcs u9, u9, t + mul t, a5, b5 + adcs u10, u10, t + cset u11, cs + + umulh t, a5, b0 + adds u6, u6, t + umulh t, a5, b1 + adcs u7, u7, t + umulh t, a5, b2 + adcs u8, u8, t + umulh t, a5, b3 + adcs u9, u9, t + umulh t, a5, b4 + adcs u10, u10, t + umulh t, a5, b5 + adc u11, u11, t + + stp u4, u5, [z, #32] + +// Store back remaining digits of final result + + stp u6, u7, [z, #48] + stp u8, u9, [z, #64] + stp u10, u11, [z, #80] + +// Restore registers and return + + ldp x19, x20, [sp], #16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_8_16.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_8_16.S new file mode 100644 index 00000000000..5aa9b386945 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_8_16.S @@ -0,0 +1,521 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[8], y[8]; output z[16] +// +// extern void bignum_mul_8_16 +// (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_mul_8_16): + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! 
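The body that follows interleaves scalar Karatsuba corrections with NEON sequences (uzp1/rev64/mul.4s/uaddlp/shl/umlal) that evaluate 64-bit multiplications out of 32x32->64 pieces, two lanes at a time; the scalar umulh instructions supply the high halves where the packed path only yields the low 64 bits. As a reading aid, here is a scalar C model of the underlying decomposition those idioms rely on. The helper name mul_64x64_from_32 is ours, not s2n-bignum's.

#include <stdint.h>

typedef unsigned __int128 u128;

/* Rebuild a 64x64->128 product from four 32x32->64 partial products:
 * a*b = (ah*bh)<<64 + (ah*bl + al*bh)<<32 + al*bl. */
static u128 mul_64x64_from_32(uint64_t a, uint64_t b) {
    uint64_t al = (uint32_t)a, ah = a >> 32;
    uint64_t bl = (uint32_t)b, bh = b >> 32;
    uint64_t ll = al * bl;     /* bits   0..63  */
    uint64_t lh = al * bh;     /* bits  32..95  */
    uint64_t hl = ah * bl;     /* bits  32..95  */
    uint64_t hh = ah * bh;     /* bits  64..127 */
    u128 mid = (u128)lh + hl;  /* at most 65 bits, cannot overflow */
    return ((u128)hh << 64) + (mid << 32) + ll;  /* equals (u128)a * b */
}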
+ ldp x3, x4, [x1] + ldr q0, [x1] + ldp x7, x8, [x2] + ldr q1, [x2] + ldp x5, x6, [x1, #16] + ldr q2, [x1, #16] + ldp x9, x10, [x2, #16] + ldr q3, [x2, #16] + uzp1 v4.4s, v1.4s, v0.4s + rev64 v1.4s, v1.4s + uzp1 v5.4s, v0.4s, v0.4s + mul v0.4s, v1.4s, v0.4s + uaddlp v0.2d, v0.4s + shl v0.2d, v0.2d, #32 + umlal v0.2d, v5.2s, v4.2s + mov x11, v0.d[0] + mov x15, v0.d[1] + uzp1 v0.4s, v3.4s, v2.4s + rev64 v1.4s, v3.4s + uzp1 v3.4s, v2.4s, v2.4s + mul v1.4s, v1.4s, v2.4s + uaddlp v1.2d, v1.4s + shl v1.2d, v1.2d, #32 + umlal v1.2d, v3.2s, v0.2s + mov x16, v1.d[0] + mov x17, v1.d[1] + ldr q0, [x1, #32] + ldr q1, [x2, #32] + ldr q2, [x1, #48] + ldr q3, [x2, #48] + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + uzp1 v4.4s, v1.4s, v0.4s + rev64 v1.4s, v1.4s + uzp1 v5.4s, v0.4s, v0.4s + mul v0.4s, v1.4s, v0.4s + uaddlp v0.2d, v0.4s + shl v0.2d, v0.2d, #32 + umlal v0.2d, v5.2s, v4.2s + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x9 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x16, x16, x22 + eor x21, x21, x20 + adcs x17, x17, x21 + adc x19, x19, x20 + subs x24, x3, x4 + cneg x24, x24, cc + csetm x20, cc + subs x21, x8, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x12, x12, x22 + eor x21, x21, x20 + adcs x13, x13, x21 + adcs x14, x14, x20 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x15, x15, x22 + eor x21, x21, x20 + adcs x16, x16, x21 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x13, x13, x22 + eor x21, x21, x20 + adcs x14, x14, x21 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + ldp x3, x4, [x1, #32] + stp x11, x12, [x0] + ldp x7, x8, [x2, #32] + stp x13, x14, [x0, #16] + ldp x5, x6, [x1, #48] + stp x15, x16, [x0, #32] + ldp x9, x10, [x2, #48] + stp x17, x19, [x0, #48] + mov x11, v0.d[0] + mov x15, v0.d[1] + uzp1 v0.4s, v3.4s, v2.4s + rev64 v1.4s, v3.4s + uzp1 v3.4s, v2.4s, v2.4s + mul v1.4s, v1.4s, v2.4s + uaddlp v1.2d, v1.4s + shl v1.2d, v1.2d, #32 + umlal v1.2d, v3.2s, v0.2s + mov x16, 
v1.d[0] + mov x17, v1.d[1] + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + ldp x22, x21, [x0, #32] + adds x11, x11, x22 + adcs x12, x12, x21 + ldp x22, x21, [x0, #48] + adcs x13, x13, x22 + adcs x14, x14, x21 + adcs x15, x15, xzr + adcs x16, x16, xzr + adcs x17, x17, xzr + adc x19, x19, xzr + subs x24, x5, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x9 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x16, x16, x22 + eor x21, x21, x20 + adcs x17, x17, x21 + adc x19, x19, x20 + subs x24, x3, x4 + cneg x24, x24, cc + csetm x20, cc + subs x21, x8, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x12, x12, x22 + eor x21, x21, x20 + adcs x13, x13, x21 + adcs x14, x14, x20 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x15, x15, x22 + eor x21, x21, x20 + adcs x16, x16, x21 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x13, x13, x22 + eor x21, x21, x20 + adcs x14, x14, x21 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + ldp x22, x21, [x1] + subs x3, x3, x22 + sbcs x4, x4, x21 + ldp x22, x21, [x1, #16] + sbcs x5, x5, x22 + sbcs x6, x6, x21 + csetm x24, cc + stp x11, x12, [x0, #64] + ldp x22, x21, [x2] + subs x7, x22, x7 + sbcs x8, x21, x8 + ldp x22, x21, [x2, #16] + sbcs x9, x22, x9 + sbcs x10, x21, x10 + csetm x1, cc + stp x13, x14, [x0, #80] + eor x3, x3, x24 + subs x3, x3, x24 + eor x4, x4, x24 + sbcs x4, x4, x24 + eor x5, x5, x24 + sbcs x5, x5, x24 + eor x6, x6, x24 + sbc x6, x6, x24 + stp x15, x16, [x0, #96] + eor x7, x7, x1 + subs x7, x7, x1 + eor x8, x8, x1 + sbcs x8, x8, x1 + eor x9, x9, x1 + sbcs x9, x9, x1 + eor x10, x10, x1 + sbc x10, x10, x1 + stp x17, x19, [x0, #112] + eor x1, x1, x24 + mul x11, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + mul x17, x6, x10 + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + 
adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x9 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x16, x16, x22 + eor x21, x21, x20 + adcs x17, x17, x21 + adc x19, x19, x20 + subs x24, x3, x4 + cneg x24, x24, cc + csetm x20, cc + subs x21, x8, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x12, x12, x22 + eor x21, x21, x20 + adcs x13, x13, x21 + adcs x14, x14, x20 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x15, x15, x22 + eor x21, x21, x20 + adcs x16, x16, x21 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x13, x13, x22 + eor x21, x21, x20 + adcs x14, x14, x21 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + ldp x3, x4, [x0] + ldp x7, x8, [x0, #64] + adds x3, x3, x7 + adcs x4, x4, x8 + ldp x5, x6, [x0, #16] + ldp x9, x10, [x0, #80] + adcs x5, x5, x9 + adcs x6, x6, x10 + ldp x20, x21, [x0, #96] + adcs x7, x7, x20 + adcs x8, x8, x21 + ldp x22, x23, [x0, #112] + adcs x9, x9, x22 + adcs x10, x10, x23 + adcs x24, x1, xzr + adc x2, x1, xzr + cmn x1, #0x1 + eor x11, x11, x1 + adcs x3, x11, x3 + eor x12, x12, x1 + adcs x4, x12, x4 + eor x13, x13, x1 + adcs x5, x13, x5 + eor x14, x14, x1 + adcs x6, x14, x6 + eor x15, x15, x1 + adcs x7, x15, x7 + eor x16, x16, x1 + adcs x8, x16, x8 + eor x17, x17, x1 + adcs x9, x17, x9 + eor x19, x19, x1 + adcs x10, x19, x10 + adcs x20, x20, x24 + adcs x21, x21, x2 + adcs x22, x22, x2 + adc x23, x23, x2 + stp x3, x4, [x0, #32] + stp x5, x6, [x0, #48] + stp x7, x8, [x0, #64] + stp x9, x10, [x0, #80] + stp x20, x21, [x0, #96] + stp x22, x23, [x0, #112] + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_8_16_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_8_16_alt.S new file mode 100644 index 00000000000..2d0a80e1c80 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_8_16_alt.S @@ -0,0 +1,406 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[8], y[8]; output z[16] +// +// extern void bignum_mul_8_16_alt +// (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16_alt) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 + +// These are repeated mod 2 as we load paris of inputs + +#define a0 x3 +#define a1 x4 +#define a2 x3 +#define a3 x4 +#define a4 x3 +#define a5 x4 +#define a6 x3 +#define a7 x4 + +#define b0 x5 +#define b1 x6 +#define b2 x7 +#define b3 x8 +#define b4 x9 +#define b5 x10 +#define b6 x11 +#define b7 x12 + +#define t x13 + +// These repeat mod 10 as we write back + +#define u0 x14 +#define u1 x15 +#define u2 x16 +#define u3 x17 +#define u4 x19 +#define u5 x20 +#define u6 x21 +#define u7 x22 +#define u8 x23 +#define u9 x24 +#define u10 x14 +#define u11 x15 +#define u12 x16 +#define u13 x17 +#define u14 x19 +#define u15 x20 + +S2N_BN_SYMBOL(bignum_mul_8_16_alt): + +// Save more registers + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + +// Load operands and set up row 0 = [u8;...;u0] = a0 * [b7;...;b0] + + ldp a0, a1, [x] + ldp b0, b1, [y] + + mul u0, a0, b0 + umulh u1, a0, b0 + mul t, a0, b1 + umulh u2, a0, b1 + adds u1, u1, t + + ldp b2, b3, [y, #16] + + mul t, a0, b2 + umulh u3, a0, b2 + adcs u2, u2, t + + mul t, a0, b3 + umulh u4, a0, b3 + adcs u3, u3, t + + ldp b4, b5, [y, #32] + + mul t, a0, b4 + umulh u5, a0, b4 + adcs u4, u4, t + + mul t, a0, b5 + umulh u6, a0, b5 + adcs u5, u5, t + + ldp b6, b7, [y, #48] + + mul t, a0, b6 + umulh u7, a0, b6 + adcs u6, u6, t + + mul t, a0, b7 + umulh u8, a0, b7 + adcs u7, u7, t + + adc u8, u8, xzr + +// Row 1 = [u9;...;u0] = [a1;a0] * [b7;...;b0] + + mul t, a1, b0 + adds u1, u1, t + mul t, a1, b1 + adcs u2, u2, t + mul t, a1, b2 + adcs u3, u3, t + mul t, a1, b3 + adcs u4, u4, t + mul t, a1, b4 + adcs u5, u5, t + mul t, a1, b5 + adcs u6, u6, t + mul t, a1, b6 + adcs u7, u7, t + mul t, a1, b7 + adcs u8, u8, t + cset u9, cs + + umulh t, a1, b0 + adds u2, u2, t + umulh t, a1, b1 + adcs u3, u3, t + umulh t, a1, b2 + adcs u4, u4, t + umulh t, a1, b3 + adcs u5, u5, t + umulh t, a1, b4 + adcs u6, u6, t + umulh t, a1, b5 + adcs u7, u7, t + umulh t, a1, b6 + adcs u8, u8, t + umulh t, a1, b7 + adc u9, u9, t + + stp u0, u1, [z] + +// Row 2 = [u10;...;u0] = [a2;a1;a0] * [b7;...;b0] + + ldp a2, a3, [x, #16] + + mul t, a2, b0 + adds u2, u2, t + mul t, a2, b1 + adcs u3, u3, t + mul t, a2, b2 + adcs u4, u4, t + mul t, a2, b3 + adcs u5, u5, t + mul t, a2, b4 + adcs u6, u6, t + mul t, a2, b5 + adcs u7, u7, t + mul t, a2, b6 + adcs u8, u8, t + mul t, a2, b7 + adcs u9, u9, t + cset u10, cs + + umulh t, a2, b0 + adds u3, u3, t + umulh t, a2, b1 + adcs u4, u4, t + umulh t, a2, b2 + adcs u5, u5, t + umulh t, a2, b3 + adcs u6, u6, t + umulh t, a2, b4 + adcs u7, u7, t + umulh t, a2, b5 + adcs u8, u8, t + umulh t, a2, b6 + adcs u9, u9, t + umulh t, a2, b7 + adc u10, u10, t + +// Row 3 = [u11;...;u0] = [a3;a2;a1;a0] * [b7;...;b0] + + mul t, a3, b0 + adds u3, u3, t + mul t, a3, b1 + adcs u4, u4, t + mul t, a3, b2 + adcs u5, u5, t + mul t, a3, b3 + adcs u6, u6, t + mul t, a3, b4 + adcs u7, u7, t + mul t, 
a3, b5 + adcs u8, u8, t + mul t, a3, b6 + adcs u9, u9, t + mul t, a3, b7 + adcs u10, u10, t + cset u11, cs + + umulh t, a3, b0 + adds u4, u4, t + umulh t, a3, b1 + adcs u5, u5, t + umulh t, a3, b2 + adcs u6, u6, t + umulh t, a3, b3 + adcs u7, u7, t + umulh t, a3, b4 + adcs u8, u8, t + umulh t, a3, b5 + adcs u9, u9, t + umulh t, a3, b6 + adcs u10, u10, t + umulh t, a3, b7 + adc u11, u11, t + + stp u2, u3, [z, #16] + +// Row 4 = [u12;...;u0] = [a4;a3;a2;a1;a0] * [b7;...;b0] + + ldp a4, a5, [x, #32] + + mul t, a4, b0 + adds u4, u4, t + mul t, a4, b1 + adcs u5, u5, t + mul t, a4, b2 + adcs u6, u6, t + mul t, a4, b3 + adcs u7, u7, t + mul t, a4, b4 + adcs u8, u8, t + mul t, a4, b5 + adcs u9, u9, t + mul t, a4, b6 + adcs u10, u10, t + mul t, a4, b7 + adcs u11, u11, t + cset u12, cs + + umulh t, a4, b0 + adds u5, u5, t + umulh t, a4, b1 + adcs u6, u6, t + umulh t, a4, b2 + adcs u7, u7, t + umulh t, a4, b3 + adcs u8, u8, t + umulh t, a4, b4 + adcs u9, u9, t + umulh t, a4, b5 + adcs u10, u10, t + umulh t, a4, b6 + adcs u11, u11, t + umulh t, a4, b7 + adc u12, u12, t + +// Row 5 = [u13;...;u0] = [a5;a4;a3;a2;a1;a0] * [b7;...;b0] + + mul t, a5, b0 + adds u5, u5, t + mul t, a5, b1 + adcs u6, u6, t + mul t, a5, b2 + adcs u7, u7, t + mul t, a5, b3 + adcs u8, u8, t + mul t, a5, b4 + adcs u9, u9, t + mul t, a5, b5 + adcs u10, u10, t + mul t, a5, b6 + adcs u11, u11, t + mul t, a5, b7 + adcs u12, u12, t + cset u13, cs + + umulh t, a5, b0 + adds u6, u6, t + umulh t, a5, b1 + adcs u7, u7, t + umulh t, a5, b2 + adcs u8, u8, t + umulh t, a5, b3 + adcs u9, u9, t + umulh t, a5, b4 + adcs u10, u10, t + umulh t, a5, b5 + adcs u11, u11, t + umulh t, a5, b6 + adcs u12, u12, t + umulh t, a5, b7 + adc u13, u13, t + + stp u4, u5, [z, #32] + +// Row 6 = [u14;...;u0] = [a6;a5;a4;a3;a2;a1;a0] * [b7;...;b0] + + ldp a6, a7, [x, #48] + + mul t, a6, b0 + adds u6, u6, t + mul t, a6, b1 + adcs u7, u7, t + mul t, a6, b2 + adcs u8, u8, t + mul t, a6, b3 + adcs u9, u9, t + mul t, a6, b4 + adcs u10, u10, t + mul t, a6, b5 + adcs u11, u11, t + mul t, a6, b6 + adcs u12, u12, t + mul t, a6, b7 + adcs u13, u13, t + cset u14, cs + + umulh t, a6, b0 + adds u7, u7, t + umulh t, a6, b1 + adcs u8, u8, t + umulh t, a6, b2 + adcs u9, u9, t + umulh t, a6, b3 + adcs u10, u10, t + umulh t, a6, b4 + adcs u11, u11, t + umulh t, a6, b5 + adcs u12, u12, t + umulh t, a6, b6 + adcs u13, u13, t + umulh t, a6, b7 + adc u14, u14, t + +// Row 7 = [u15;...;u0] = [a7;a6;a5;a4;a3;a2;a1;a0] * [b7;...;b0] + + mul t, a7, b0 + adds u7, u7, t + mul t, a7, b1 + adcs u8, u8, t + mul t, a7, b2 + adcs u9, u9, t + mul t, a7, b3 + adcs u10, u10, t + mul t, a7, b4 + adcs u11, u11, t + mul t, a7, b5 + adcs u12, u12, t + mul t, a7, b6 + adcs u13, u13, t + mul t, a7, b7 + adcs u14, u14, t + cset u15, cs + + umulh t, a7, b0 + adds u8, u8, t + umulh t, a7, b1 + adcs u9, u9, t + umulh t, a7, b2 + adcs u10, u10, t + umulh t, a7, b3 + adcs u11, u11, t + umulh t, a7, b4 + adcs u12, u12, t + umulh t, a7, b5 + adcs u13, u13, t + umulh t, a7, b6 + adcs u14, u14, t + umulh t, a7, b7 + adc u15, u15, t + + stp u6, u7, [z, #48] + +// Store back remaining digits of final result + + stp u8, u9, [z, #64] + stp u10, u11, [z, #80] + stp u12, u13, [z, #96] + stp u14, u15, [z, #112] + +// Restore registers + + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_4_8.S 
b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_4_8.S new file mode 100644 index 00000000000..e6fb56c6e31 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_4_8.S @@ -0,0 +1,144 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[4]; output z[8] +// +// extern void bignum_sqr_4_8 (uint64_t z[static 8], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// 2x2 squaring macro: [s3;s2;s1;s0] := [a1;a0]^2 with t0,t1,t2 temporaries +// This uses 32x32->64 multiplications to reduce the number of UMULHs +// --------------------------------------------------------------------------- + +#define sqr2(s3,s2,s1,s0, a1,a1short,a0,a0short, t2,t1,t0,t0short) \ + umull s0, a0short, a0short __LF \ + lsr t0, a0, #32 __LF \ + umull s1, t0short, t0short __LF \ + umull t0, a0short, t0short __LF \ + adds s0, s0, t0, lsl #33 __LF \ + lsr t0, t0, #31 __LF \ + adc s1, s1, t0 __LF \ + umull s2, a1short, a1short __LF \ + lsr t0, a1, #32 __LF \ + umull s3, t0short, t0short __LF \ + umull t0, a1short, t0short __LF \ + mul t1, a0, a1 __LF \ + umulh t2, a0, a1 __LF \ + adds s2, s2, t0, lsl #33 __LF \ + lsr t0, t0, #31 __LF \ + adc s3, s3, t0 __LF \ + adds t1, t1, t1 __LF \ + adcs t2, t2, t2 __LF \ + adc s3, s3, xzr __LF \ + adds s1, s1, t1 __LF \ + adcs s2, s2, t2 __LF \ + adc s3, s3, xzr + +// Main code + +#define z x0 +#define x x1 + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 + +#define s0 x6 +#define s1 x7 +#define s2 x8 +#define s3 x9 +#define s4 x10 +#define s5 x11 +#define s6 x12 +#define s7 x13 + +#define d0 x14 +#define d1 x15 +#define d2 x16 + +// Short versions + +#define a0short w2 +#define a1short w3 +#define a2short w4 +#define a3short w5 +#define d2short w16 +#define s3short w9 + +S2N_BN_SYMBOL(bignum_sqr_4_8): + +// Load all the elements + + ldp a0, a1, [x] + ldp a2, a3, [x, #16] + +// Compute L = [s3;s2;s1;s0] = square of lower half + + sqr2(s3,s2,s1,s0, a1,a1short,a0,a0short, d0,d1,d2,d2short) + +// Compute H = [s7;s6;s5;s4] = square of upper half + + sqr2(s7,s6,s5,s4, a3,a3short,a2,a2short, d0,d1,d2,d2short) + +// Let [a1;a0] = |[a3;a2] - [a1;a0]| be the absolute difference + + subs a0, a0, a2 + sbcs a1, a1, a3 + csetm d0, cc + eor a0, a0, d0 + subs a0, a0, d0 + eor a1, a1, d0 + sbc a1, a1, d0 + +// Form H' = H + L_hi (which fits in 4 words) + + adds s4, s4, s2 + adcs s5, s5, s3 + adcs s6, s6, xzr + adc s7, s7, xzr + +// Let M = [d2;d1;a3;a2] = ([a3;a2] - [a1;a0])^2 + + sqr2(d2,d1,a3,a2, a1,a1short,a0,a0short, d0,s2,s3,s3short) + +// Now form (2^64 + 1) * (H'::L), with a bit of carry-shortening + + adds s2, s0, s4 + adcs s3, s1, s5 + adcs s4, s4, s6 + adcs s5, s5, s7 + csetm d0, cc + +// Subtract the middle term M + + subs s2, s2, a2 + sbcs s3, s3, a3 + sbcs s4, s4, d1 + sbcs s5, s5, d2 + adcs s6, s6, d0 + adc s7, s7, d0 + +// Store back + + stp s0, s1, [z] + stp s2, s3, [z, 16] + stp s4, s5, [z, 32] + stp s6, s7, [z, 48] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits 
+#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_4_8_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_4_8_alt.S new file mode 100644 index 00000000000..b7e5eed3515 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_4_8_alt.S @@ -0,0 +1,123 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[4]; output z[8] +// +// extern void bignum_sqr_4_8_alt +// (uint64_t z[static 8], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8_alt) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 + +#define l x6 +#define h x7 + +#define u0 x8 +#define u1 x9 +#define u2 x10 +#define u3 x11 +#define u4 x12 +#define u5 x13 +#define u6 x14 + +// This one is the same as h, which is safe with this computation sequence + +#define u7 h + +S2N_BN_SYMBOL(bignum_sqr_4_8_alt): + +// Load all the elements, set up an initial window [u6;...u1] = [23;03;01] +// and chain in the addition of 02 + 12 + 13 (no carry-out is possible). +// This gives all the "heterogeneous" terms of the squaring ready to double + + ldp a0, a1, [x] + + mul u1, a0, a1 + umulh u2, a0, a1 + + ldp a2, a3, [x, #16] + + mul u3, a0, a3 + umulh u4, a0, a3 + + mul l, a0, a2 + umulh h, a0, a2 + adds u2, u2, l + + adcs u3, u3, h + mul l, a1, a2 + umulh h, a1, a2 + adc h, h, xzr + adds u3, u3, l + + mul u5, a2, a3 + umulh u6, a2, a3 + + adcs u4, u4, h + mul l, a1, a3 + umulh h, a1, a3 + adc h, h, xzr + adds u4, u4, l + + adcs u5, u5, h + adc u6, u6, xzr + +// Now just double it; this simple approach seems to work better than extr + + adds u1, u1, u1 + adcs u2, u2, u2 + adcs u3, u3, u3 + adcs u4, u4, u4 + adcs u5, u5, u5 + adcs u6, u6, u6 + cset u7, cs + +// Add the homogeneous terms 00 + 11 + 22 + 33 + + umulh l, a0, a0 + mul u0, a0, a0 + adds u1, u1, l + + mul l, a1, a1 + adcs u2, u2, l + umulh l, a1, a1 + adcs u3, u3, l + + mul l, a2, a2 + adcs u4, u4, l + umulh l, a2, a2 + adcs u5, u5, l + + mul l, a3, a3 + adcs u6, u6, l + umulh l, a3, a3 + adc u7, u7, l + +// Store back final result + + stp u0, u1, [z] + stp u2, u3, [z, #16] + stp u4, u5, [z, #32] + stp u6, u7, [z, #48] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_6_12.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_6_12.S new file mode 100644 index 00000000000..04e530989d1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_6_12.S @@ -0,0 +1,261 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[6]; output z[12] +// +// extern void bignum_sqr_6_12 (uint64_t z[static 12], uint64_t x[static 6]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Macro returning (c,h,l) = 3-word 1s complement (x - y) * (w - z) +// c,h,l,t should all be different +// t,h should not overlap w,z +// --------------------------------------------------------------------------- + +.macro muldiffn c,h,l, t, x,y, w,z + subs \t, \x, \y + cneg \t, \t, cc + csetm \c, cc + subs \h, \w, \z + cneg \h, \h, cc + mul \l, \t, \h + umulh \h, \t, \h + cinv \c, \c, cc + eor \l, \l, \c + eor \h, \h, \c +.endm + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 +#define a4 x6 +#define a5 x7 + +#define c0 x8 +#define c1 x9 +#define c2 x10 +#define c3 x11 +#define c4 x12 +#define c5 x13 +#define d1 x14 +#define d2 x15 +#define d3 x16 +#define d4 x17 + +S2N_BN_SYMBOL(bignum_sqr_6_12): + +// Load in all words of the input + + ldp a0, a1, [x1] + ldp a2, a3, [x1, #16] + ldp a4, a5, [x1, #32] + +// Square the low half + + mul d1, a0, a1 + mul d2, a0, a2 + mul d3, a1, a2 + mul c0, a0, a0 + str c0, [x0] + mul c2, a1, a1 + mul c4, a2, a2 + + umulh d4, a0, a1 + adds d2, d2, d4 + umulh d4, a0, a2 + adcs d3, d3, d4 + umulh d4, a1, a2 + adcs d4, d4, xzr + + umulh c1, a0, a0 + umulh c3, a1, a1 + umulh c5, a2, a2 + + adds d1, d1, d1 + adcs d2, d2, d2 + adcs d3, d3, d3 + adcs d4, d4, d4 + adc c5, c5, xzr + + adds c1, c1, d1 + str c1, [x0,#8] + adcs c2, c2, d2 + str c2, [x0,#16] + adcs c3, c3, d3 + str c3, [x0,#24] + adcs c4, c4, d4 + str c4, [x0,#32] + adc c5, c5, xzr + str c5, [x0,#40] + +// Square the high half + + mul d1, a3, a4 + mul d2, a3, a5 + mul d3, a4, a5 + mul c0, a3, a3 + str c0, [x0,#48] + mul c2, a4, a4 + mul c4, a5, a5 + + umulh d4, a3, a4 + adds d2, d2, d4 + umulh d4, a3, a5 + adcs d3, d3, d4 + umulh d4, a4, a5 + adcs d4, d4, xzr + + umulh c1, a3, a3 + umulh c3, a4, a4 + umulh c5, a5, a5 + + adds d1, d1, d1 + adcs d2, d2, d2 + adcs d3, d3, d3 + adcs d4, d4, d4 + adc c5, c5, xzr + + adds c1, c1, d1 + str c1, [x0,#56] + adcs c2, c2, d2 + str c2, [x0,#64] + adcs c3, c3, d3 + str c3, [x0,#72] + adcs c4, c4, d4 + str c4, [x0,#80] + adc c5, c5, xzr + str c5, [x0,#88] + +// Compute product of the cross-term with ADK 3x3->6 multiplier + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 +#define a4 x6 +#define a5 x7 +#define s0 x8 +#define s1 x9 +#define s2 x10 +#define s3 x11 +#define s4 x12 +#define s5 x13 + +#define l1 x14 +#define l2 x15 +#define h0 x16 +#define h1 x17 +#define h2 x13 + +#define s6 h1 +#define c l1 +#define h l2 +#define l h0 +#define t h1 + + mul s0, a0, a3 + mul l1, a1, a4 + mul l2, a2, a5 + umulh h0, a0, a3 + umulh h1, a1, a4 + umulh h2, a2, a5 + + adds h0, h0, l1 + adcs h1, h1, l2 + adc h2, h2, xzr + + adds s1, h0, s0 + adcs s2, h1, h0 + adcs s3, h2, h1 + adc s4, h2, xzr + + adds s2, s2, s0 + adcs s3, s3, h0 + adcs s4, s4, h1 + adc s5, h2, xzr + + muldiffn c,h,l, t, a0,a1, a4,a3 + adds xzr, c, #1 + adcs s1, s1, l + adcs s2, s2, h + adcs s3, s3, c + adcs s4, s4, c + adc s5, s5, c + + muldiffn c,h,l, t, a0,a2, a5,a3 
+ adds xzr, c, #1 + adcs s2, s2, l + adcs s3, s3, h + adcs s4, s4, c + adc s5, s5, c + + muldiffn c,h,l, t, a1,a2, a5,a4 + adds xzr, c, #1 + adcs s3, s3, l + adcs s4, s4, h + adc s5, s5, c + +// Double it, catching the carry + + adds s0, s0, s0 + adcs s1, s1, s1 + adcs s2, s2, s2 + adcs s3, s3, s3 + adcs s4, s4, s4 + adcs s5, s5, s5 + adc s6, xzr, xzr + +// Finally, add it into the term + + ldr a0, [x0, #24] + adds a0, a0, s0 + str a0, [x0, #24] + + ldr a0, [x0, #32] + adcs a0, a0, s1 + str a0, [x0, #32] + + ldr a0, [x0, #40] + adcs a0, a0, s2 + str a0, [x0, #40] + + ldr a0, [x0, #48] + adcs a0, a0, s3 + str a0, [x0, #48] + + ldr a0, [x0, #56] + adcs a0, a0, s4 + str a0, [x0, #56] + + ldr a0, [x0, #64] + adcs a0, a0, s5 + str a0, [x0, #64] + + ldr a0, [x0, #72] + adcs a0, a0, s6 + str a0, [x0, #72] + + ldr a0, [x0, #80] + adcs a0, a0, xzr + str a0, [x0, #80] + + ldr a0, [x0, #88] + adc a0, a0, xzr + str a0, [x0, #88] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_6_12_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_6_12_alt.S new file mode 100644 index 00000000000..deec8f287fe --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_6_12_alt.S @@ -0,0 +1,192 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[6]; output z[12] +// +// extern void bignum_sqr_6_12_alt (uint64_t z[static 12], uint64_t x[static 6]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12_alt) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 +#define a4 x6 +#define a5 x7 + +#define l x8 + +#define u0 x2 // The same as a0, which is safe +#define u1 x9 +#define u2 x10 +#define u3 x11 +#define u4 x12 +#define u5 x13 +#define u6 x14 +#define u7 x15 +#define u8 x16 +#define u9 x17 +#define u10 x19 +#define u11 x20 + +S2N_BN_SYMBOL(bignum_sqr_6_12_alt): + +// It's convenient to have two more registers to play with + + stp x19, x20, [sp, #-16]! + +// Load all the elements as [a5;a4;a3;a2;a1;a0], set up an initial +// window [u8;u7; u6;u5; u4;u3; u2;u1] = [34;05;03;01], and then +// chain in the addition of 02 + 12 + 13 + 14 + 15 to that window +// (no carry-out possible since we add it to the top of a product). 
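Across the *_alt squaring routines this windowing is always the same scheme: accumulate each off-diagonal product a[i]*a[j] with i < j exactly once, double the whole accumulator, then add in the diagonal squares a[i]^2. A generic C sketch of that scheme, using __int128 accumulation in place of explicit carry flags (the name sqr_n_model is ours, not s2n-bignum's):

#include <stdint.h>
#include <string.h>

typedef unsigned __int128 u128;

/* Reference n-word squaring: off-diagonal terms once, doubled, plus squares.
 * z must have room for 2*n words. */
static void sqr_n_model(uint64_t *z, const uint64_t *a, int n) {
    memset(z, 0, 2 * n * sizeof(uint64_t));
    /* 1. Strictly upper-triangular products a[i]*a[j], i < j. */
    for (int i = 0; i < n; i++) {
        uint64_t carry = 0;
        for (int j = i + 1; j < n; j++) {
            u128 t = (u128)a[i] * a[j] + z[i + j] + carry;
            z[i + j] = (uint64_t)t;
            carry = (uint64_t)(t >> 64);
        }
        z[i + n] = carry;
    }
    /* 2. Double the accumulated cross terms. */
    uint64_t c = 0;
    for (int k = 0; k < 2 * n; k++) {
        uint64_t hi = z[k] >> 63;
        z[k] = (z[k] << 1) | c;
        c = hi;
    }
    /* 3. Add the diagonal squares a[i]^2 at word position 2*i. */
    uint64_t carry = 0;
    for (int i = 0; i < n; i++) {
        u128 t = (u128)a[i] * a[i] + z[2 * i] + carry;
        z[2 * i] = (uint64_t)t;
        u128 t2 = (u128)(uint64_t)(t >> 64) + z[2 * i + 1];
        z[2 * i + 1] = (uint64_t)t2;
        carry = (uint64_t)(t2 >> 64);
    }
}

The assembly versions differ mainly in how that window is scheduled through registers so the carry chains never have to spill.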
+ + ldp a0, a1, [x] + + mul u1, a0, a1 + umulh u2, a0, a1 + + ldp a2, a3, [x, #16] + + mul l, a0, a2 + adds u2, u2, l + + mul u3, a0, a3 + mul l, a1, a2 + adcs u3, u3, l + + umulh u4, a0, a3 + mul l, a1, a3 + adcs u4, u4, l + + ldp a4, a5, [x, #32] + + mul u5, a0, a5 + mul l, a1, a4 + adcs u5, u5, l + + umulh u6, a0, a5 + mul l, a1, a5 + adcs u6, u6, l + + mul u7, a3, a4 + adcs u7, u7, xzr + + umulh u8, a3, a4 + adc u8, u8, xzr + + umulh l, a0, a2 + adds u3, u3, l + umulh l, a1, a2 + adcs u4, u4, l + umulh l, a1, a3 + adcs u5, u5, l + umulh l, a1, a4 + adcs u6, u6, l + umulh l, a1, a5 + adcs u7, u7, l + adc u8, u8, xzr + +// Now chain in the 04 + 23 + 24 + 25 + 35 + 45 terms + + mul l, a0, a4 + adds u4, u4, l + mul l, a2, a3 + adcs u5, u5, l + mul l, a2, a4 + adcs u6, u6, l + mul l, a2, a5 + adcs u7, u7, l + mul l, a3, a5 + adcs u8, u8, l + mul u9, a4, a5 + adcs u9, u9, xzr + umulh u10, a4, a5 + adc u10, u10, xzr + + umulh l, a0, a4 + adds u5, u5, l + umulh l, a2, a3 + adcs u6, u6, l + umulh l, a2, a4 + adcs u7, u7, l + umulh l, a2, a5 + adcs u8, u8, l + umulh l, a3, a5 + adcs u9, u9, l + adc u10, u10, xzr + +// Double that, with h holding the top carry + + adds u1, u1, u1 + adcs u2, u2, u2 + adcs u3, u3, u3 + adcs u4, u4, u4 + adcs u5, u5, u5 + adcs u6, u6, u6 + adcs u7, u7, u7 + adcs u8, u8, u8 + adcs u9, u9, u9 + adcs u10, u10, u10 + cset u11, cs + +// Add the homogeneous terms 00 + 11 + 22 + 33 + 44 + 55 + + umulh l, a0, a0 + mul u0, a0, a0 + adds u1, u1, l + + mul l, a1, a1 + adcs u2, u2, l + umulh l, a1, a1 + adcs u3, u3, l + + mul l, a2, a2 + adcs u4, u4, l + umulh l, a2, a2 + adcs u5, u5, l + + mul l, a3, a3 + adcs u6, u6, l + umulh l, a3, a3 + adcs u7, u7, l + + mul l, a4, a4 + adcs u8, u8, l + umulh l, a4, a4 + adcs u9, u9, l + + mul l, a5, a5 + adcs u10, u10, l + umulh l, a5, a5 + adc u11, u11, l + +// Store back final result + + stp u0, u1, [z] + stp u2, u3, [z, #16] + stp u4, u5, [z, #32] + stp u6, u7, [z, #48] + stp u8, u9, [z, #64] + stp u10, u11, [z, #80] + +// Restore registers and return + + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_8_16.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_8_16.S new file mode 100644 index 00000000000..d79f764ea07 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_8_16.S @@ -0,0 +1,423 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[8]; output z[16] +// +// extern void bignum_sqr_8_16 (uint64_t z[static 16], uint64_t x[static 8]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16) + .text + .balign 4 + + +S2N_BN_SYMBOL(bignum_sqr_8_16): + +// Save registers + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + +// Load registers. 
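Before the interleaved scalar/NEON body below, one practical note on checking the import: both bignum_mul_8_16 and bignum_sqr_8_16 are declared above with standard prototypes, so a throwaway cross-check is easy to build. The harness below is our own sketch, not part of s2n-bignum; it assumes the two .S files are assembled and linked into an AArch64 test binary.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Prototypes exactly as given in the file headers above. */
extern void bignum_sqr_8_16(uint64_t z[static 16], uint64_t x[static 8]);
extern void bignum_mul_8_16(uint64_t z[static 16], uint64_t x[static 8],
                            uint64_t y[static 8]);

int main(void) {
    uint64_t x[8], sq[16], prod[16];
    /* Deterministic but otherwise arbitrary 512-bit operand. */
    for (int i = 0; i < 8; i++) {
        x[i] = (0x0123456789abcdefULL * (uint64_t)(i + 1)) ^
               (0xfedcba9876543210ULL >> i);
    }
    bignum_sqr_8_16(sq, x);
    bignum_mul_8_16(prod, x, x);  /* x*x must agree with the dedicated squarer */
    if (memcmp(sq, prod, sizeof sq) != 0) {
        puts("mismatch between bignum_sqr_8_16 and bignum_mul_8_16");
        return 1;
    }
    puts("ok");
    return 0;
}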
+ ldp x2, x3, [x1] +ldr q20, [x1] + ldp x4, x5, [x1, #16] +ldr q21, [x1, #16] + ldp x6, x7, [x1, #32] +ldr q22, [x1, #32] + ldp x8, x9, [x1, #48] +ldr q23, [x1, #48] +movi v30.2d, #0xffffffff + + mul x17, x2, x4 + mul x14, x3, x5 + +// Scalar+NEON: square the lower half with a near-clone of bignum_sqr_4_8 +// NEON: prepare 64x64->128 squaring of two 64-bit ints (x2, x3) +ext v1.16b, v20.16b, v20.16b, #8 + umulh x20, x2, x4 +shrn v2.2s, v20.2d, #32 + subs x21, x2, x3 +zip1 v0.2s, v20.2s, v1.2s + cneg x21, x21, cc // cc = lo, ul, last +umull v5.2d, v2.2s, v2.2s + csetm x11, cc // cc = lo, ul, last +umull v6.2d, v2.2s, v0.2s + subs x12, x5, x4 +umull v3.2d, v0.2s, v0.2s + cneg x12, x12, cc // cc = lo, ul, last +mov v1.16b, v6.16b + mul x13, x21, x12 +usra v1.2d, v3.2d, #32 + umulh x12, x21, x12 +and v4.16b, v1.16b, v30.16b + cinv x11, x11, cc // cc = lo, ul, last +add v4.2d, v4.2d, v6.2d + eor x13, x13, x11 +usra v5.2d, v4.2d, #32 + eor x12, x12, x11 +sli v3.2d, v4.2d, #32 + adds x19, x17, x20 +usra v5.2d, v1.2d, #32 + adc x20, x20, xzr + // NEON: prepare 64x64->128 squaring of two 64-bit ints (x4, x5) + ext v1.16b, v21.16b, v21.16b, #8 + umulh x21, x3, x5 + shrn v2.2s, v21.2d, #32 + adds x19, x19, x14 + zip1 v0.2s, v21.2s, v1.2s + adcs x20, x20, x21 + adc x21, x21, xzr + adds x20, x20, x14 + adc x21, x21, xzr + cmn x11, #0x1 + adcs x19, x19, x13 +mov x13, v3.d[1] // mul x13, x3, x3 + adcs x20, x20, x12 +mov x14, v5.d[1] // umulh x14, x3, x3 + adc x21, x21, x11 +mov x12, v3.d[0] // mul x12, x2, x2 + adds x17, x17, x17 +mov x11, v5.d[0] // umulh x11, x2, x2 + adcs x19, x19, x19 + umull v5.2d, v2.2s, v2.2s + adcs x20, x20, x20 + umull v6.2d, v2.2s, v0.2s + adcs x21, x21, x21 + umull v3.2d, v0.2s, v0.2s + adc x10, xzr, xzr + mov v1.16b, v6.16b + + mul x15, x2, x3 + usra v1.2d, v3.2d, #32 + umulh x16, x2, x3 + and v4.16b, v1.16b, v30.16b + adds x11, x11, x15 + add v4.2d, v4.2d, v6.2d + adcs x13, x13, x16 + usra v5.2d, v4.2d, #32 + adc x14, x14, xzr + sli v3.2d, v4.2d, #32 + adds x11, x11, x15 + usra v5.2d, v1.2d, #32 + adcs x13, x13, x16 + adc x14, x14, xzr + stp x12, x11, [x0] + mov x11, v5.d[0] // umulh x11, x4, x4 + adds x17, x17, x13 + mov x13, v3.d[1] // mul x13, x5, x5 + adcs x19, x19, x14 + mov x14, v5.d[1] // umulh x14, x5, x5 + adcs x20, x20, xzr + mov x12, v3.d[0] // mul x12, x4, x4 + adcs x21, x21, xzr +// NEON: prepare muls in the upper half +ext v1.16b, v22.16b, v22.16b, #8 + adc x10, x10, xzr +shrn v2.2s, v22.2d, #32 + stp x17, x19, [x0, #16] +zip1 v0.2s, v22.2s, v1.2s + mul x15, x4, x5 +umull v5.2d, v2.2s, v2.2s + umulh x16, x4, x5 +umull v6.2d, v2.2s, v0.2s + adds x11, x11, x15 +umull v3.2d, v0.2s, v0.2s + adcs x13, x13, x16 +mov v1.16b, v6.16b + adc x14, x14, xzr +usra v1.2d, v3.2d, #32 + adds x11, x11, x15 +and v4.16b, v1.16b, v30.16b + adcs x13, x13, x16 +add v4.2d, v4.2d, v6.2d + adc x14, x14, xzr +usra v5.2d, v4.2d, #32 + adds x12, x12, x20 +sli v3.2d, v4.2d, #32 + adcs x11, x11, x21 +usra v5.2d, v1.2d, #32 + stp x12, x11, [x0, #32] + // NEON: prepare muls in the upper half + ext v1.16b, v23.16b, v23.16b, #8 + adcs x13, x13, x10 + shrn v2.2s, v23.2d, #32 + adc x14, x14, xzr + zip1 v0.2s, v23.2s, v1.2s + stp x13, x14, [x0, #48] + +// Scalar: square the upper half with a slight variant of the previous block + mul x17, x6, x8 + umull v16.2d, v2.2s, v2.2s + mul x14, x7, x9 + umull v6.2d, v2.2s, v0.2s + umulh x20, x6, x8 + umull v18.2d, v0.2s, v0.2s + subs x21, x6, x7 + cneg x21, x21, cc // cc = lo, ul, last + mov v1.16b, v6.16b + csetm x11, cc // cc = lo, ul, last + subs x12, x9, 
x8 + cneg x12, x12, cc // cc = lo, ul, last + usra v1.2d, v18.2d, #32 + mul x13, x21, x12 + and v4.16b, v1.16b, v30.16b + umulh x12, x21, x12 + add v4.2d, v4.2d, v6.2d + cinv x11, x11, cc // cc = lo, ul, last + eor x13, x13, x11 + eor x12, x12, x11 + usra v16.2d, v4.2d, #32 + adds x19, x17, x20 + adc x20, x20, xzr + sli v18.2d, v4.2d, #32 + umulh x21, x7, x9 + adds x19, x19, x14 + adcs x20, x20, x21 + adc x21, x21, xzr + adds x20, x20, x14 +mov x14, v5.d[1] + adc x21, x21, xzr + cmn x11, #0x1 + adcs x19, x19, x13 +mov x13, v3.d[1] + adcs x20, x20, x12 +mov x12, v3.d[0] + adc x21, x21, x11 +mov x11, v5.d[0] + adds x17, x17, x17 + adcs x19, x19, x19 + usra v16.2d, v1.2d, #32 + adcs x20, x20, x20 + adcs x21, x21, x21 + adc x10, xzr, xzr +// NEON: two mul+umulhs for the next stage +uzp2 v17.4s, v21.4s, v23.4s + mul x15, x6, x7 +xtn v4.2s, v23.2d + umulh x16, x6, x7 + mov x22, v16.d[0] + adds x11, x11, x15 + adcs x13, x13, x16 +xtn v5.2s, v21.2d + adc x14, x14, xzr + adds x11, x11, x15 +rev64 v1.4s, v21.4s + adcs x13, x13, x16 + adc x14, x14, xzr + stp x12, x11, [x0, #64] + adds x17, x17, x13 + mov x13, v18.d[1] + adcs x19, x19, x14 + mov x14, v16.d[1] + adcs x20, x20, xzr + mov x12, v18.d[0] + adcs x21, x21, xzr + adc x10, x10, xzr +umull v6.2d, v4.2s, v5.2s + stp x17, x19, [x0, #80] +umull v7.2d, v4.2s, v17.2s + mul x15, x8, x9 +uzp2 v16.4s, v23.4s, v23.4s + umulh x16, x8, x9 +mul v0.4s, v1.4s, v23.4s + adds x11, x22, x15 + adcs x13, x13, x16 +usra v7.2d, v6.2d, #32 + adc x14, x14, xzr + adds x11, x11, x15 +umull v1.2d, v16.2s, v17.2s + adcs x13, x13, x16 + adc x14, x14, xzr +uaddlp v0.2d, v0.4s + adds x12, x12, x20 + adcs x11, x11, x21 +and v2.16b, v7.16b, v30.16b +umlal v2.2d, v16.2s, v5.2s +shl v0.2d, v0.2d, #32 +usra v1.2d, v7.2d, #32 +umlal v0.2d, v4.2s, v5.2s +mov x16, v0.d[1] +mov x15, v0.d[0] +usra v1.2d, v2.2d, #32 +mov x20, v1.d[0] +mov x21, v1.d[1] + stp x12, x11, [x0, #96] + adcs x13, x13, x10 + adc x14, x14, xzr + stp x13, x14, [x0, #112] + +// Now get the cross-product in [s7,...,s0] and double it as [c,s7,...,s0] + + mul x10, x2, x6 + mul x14, x3, x7 + umulh x17, x2, x6 + adds x14, x14, x17 + umulh x17, x3, x7 + adcs x15, x15, x17 + adcs x16, x16, x20 + adc x17, x21, xzr + adds x11, x14, x10 + adcs x14, x15, x14 + adcs x15, x16, x15 + adcs x16, x17, x16 + adc x17, xzr, x17 + adds x12, x14, x10 + adcs x13, x15, x11 + adcs x14, x16, x14 + adcs x15, x17, x15 + adcs x16, xzr, x16 + adc x17, xzr, x17 + subs x22, x4, x5 + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last + subs x20, x9, x8 + cneg x20, x20, cc // cc = lo, ul, last + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc // cc = lo, ul, last + cmn x19, #0x1 + eor x21, x21, x19 + adcs x15, x15, x21 + eor x20, x20, x19 + adcs x16, x16, x20 + adc x17, x17, x19 + subs x22, x2, x3 + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last + subs x20, x7, x6 + cneg x20, x20, cc // cc = lo, ul, last + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc // cc = lo, ul, last + cmn x19, #0x1 + eor x21, x21, x19 + adcs x11, x11, x21 + eor x20, x20, x19 + adcs x12, x12, x20 + adcs x13, x13, x19 + adcs x14, x14, x19 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x3, x5 + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last + subs x20, x9, x7 + cneg x20, x20, cc // cc = lo, ul, last + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc // cc = lo, ul, last + cmn x19, #0x1 + eor x21, x21, x19 + adcs x14, x14, x21 + eor 
x20, x20, x19 + adcs x15, x15, x20 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x2, x4 + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last + subs x20, x8, x6 + cneg x20, x20, cc // cc = lo, ul, last + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc // cc = lo, ul, last + cmn x19, #0x1 + eor x21, x21, x19 + adcs x12, x12, x21 + eor x20, x20, x19 + adcs x13, x13, x20 + adcs x14, x14, x19 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x2, x5 + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last + subs x20, x9, x6 + cneg x20, x20, cc // cc = lo, ul, last + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc // cc = lo, ul, last + cmn x19, #0x1 + eor x21, x21, x19 + adcs x13, x13, x21 + eor x20, x20, x19 + adcs x14, x14, x20 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x3, x4 + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last + subs x20, x8, x7 + cneg x20, x20, cc // cc = lo, ul, last + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc // cc = lo, ul, last + cmn x19, #0x1 + eor x21, x21, x19 + adcs x13, x13, x21 + eor x20, x20, x19 + adcs x14, x14, x20 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + adds x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adcs x17, x17, x17 + adc x19, xzr, xzr + +// Add it back to the buffer + + ldp x2, x3, [x0, #32] + adds x10, x10, x2 + adcs x11, x11, x3 + stp x10, x11, [x0, #32] + + ldp x2, x3, [x0, #48] + adcs x12, x12, x2 + adcs x13, x13, x3 + stp x12, x13, [x0, #48] + + ldp x2, x3, [x0, #64] + adcs x14, x14, x2 + adcs x15, x15, x3 + stp x14, x15, [x0, #64] + + ldp x2, x3, [x0, #80] + adcs x16, x16, x2 + adcs x17, x17, x3 + stp x16, x17, [x0, #80] + + ldp x2, x3, [x0, #96] + adcs x2, x2, x19 + adcs x3, x3, xzr + stp x2, x3, [x0, #96] + + ldp x2, x3, [x0, #112] + adcs x2, x2, xzr + adc x3, x3, xzr + stp x2, x3, [x0, #112] + + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_8_16_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_8_16_alt.S new file mode 100644 index 00000000000..2faf94d00e4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_8_16_alt.S @@ -0,0 +1,280 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[8]; output z[16] +// +// extern void bignum_sqr_8_16_alt +// (uint64_t z[static 16], uint64_t x[static 8]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16_alt) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 +#define a4 x6 +#define a5 x7 +#define a6 x8 +#define a7 x9 + +#define l x10 + +#define u0 x2 // The same as a0, which is safe +#define u1 x11 +#define u2 x12 +#define u3 x13 +#define u4 x14 +#define u5 x15 +#define u6 x16 +#define u7 x17 +#define u8 x19 +#define u9 x20 +#define u10 x21 +#define u11 x22 +#define u12 x23 +#define u13 x24 +#define u14 x25 +#define u15 x26 + +S2N_BN_SYMBOL(bignum_sqr_8_16_alt): + +// It's convenient to have more registers to play with + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + +// Load all the elements as [a7;a6;a5;a4;a3;a2;a1;a0], set up an initial +// window [u8;u7;u6;u5;u4;u3;u2;u1] = 10 + 20 + 30 + 40 + 50 + 60 + 70 + + ldp a0, a1, [x] + + mul u1, a0, a1 + umulh u2, a0, a1 + + ldp a2, a3, [x, #16] + + mul l, a0, a2 + umulh u3, a0, a2 + adds u2, u2, l + + ldp a4, a5, [x, #32] + + mul l, a0, a3 + umulh u4, a0, a3 + adcs u3, u3, l + + ldp a6, a7, [x, #48] + + mul l, a0, a4 + umulh u5, a0, a4 + adcs u4, u4, l + + mul l, a0, a5 + umulh u6, a0, a5 + adcs u5, u5, l + + mul l, a0, a6 + umulh u7, a0, a6 + adcs u6, u6, l + + mul l, a0, a7 + umulh u8, a0, a7 + adcs u7, u7, l + + adc u8, u8, xzr + +// Add in the next diagonal = 21 + 31 + 41 + 51 + 61 + 71 + 54 + + mul l, a1, a2 + adds u3, u3, l + mul l, a1, a3 + adcs u4, u4, l + mul l, a1, a4 + adcs u5, u5, l + mul l, a1, a5 + adcs u6, u6, l + mul l, a1, a6 + adcs u7, u7, l + mul l, a1, a7 + adcs u8, u8, l + cset u9, cs + + umulh l, a1, a2 + adds u4, u4, l + umulh l, a1, a3 + adcs u5, u5, l + umulh l, a1, a4 + adcs u6, u6, l + umulh l, a1, a5 + adcs u7, u7, l + umulh l, a1, a6 + adcs u8, u8, l + umulh l, a1, a7 + adc u9, u9, l + mul l, a4, a5 + umulh u10, a4, a5 + adds u9, u9, l + adc u10, u10, xzr + +// And the next one = 32 + 42 + 52 + 62 + 72 + 64 + 65 + + mul l, a2, a3 + adds u5, u5, l + mul l, a2, a4 + adcs u6, u6, l + mul l, a2, a5 + adcs u7, u7, l + mul l, a2, a6 + adcs u8, u8, l + mul l, a2, a7 + adcs u9, u9, l + mul l, a4, a6 + adcs u10, u10, l + cset u11, cs + + umulh l, a2, a3 + adds u6, u6, l + umulh l, a2, a4 + adcs u7, u7, l + umulh l, a2, a5 + adcs u8, u8, l + umulh l, a2, a6 + adcs u9, u9, l + umulh l, a2, a7 + adcs u10, u10, l + umulh l, a4, a6 + adc u11, u11, l + mul l, a5, a6 + umulh u12, a5, a6 + adds u11, u11, l + adc u12, u12, xzr + +// And the final one = 43 + 53 + 63 + 73 + 74 + 75 + 76 + + mul l, a3, a4 + adds u7, u7, l + mul l, a3, a5 + adcs u8, u8, l + mul l, a3, a6 + adcs u9, u9, l + mul l, a3, a7 + adcs u10, u10, l + mul l, a4, a7 + adcs u11, u11, l + mul l, a5, a7 + adcs u12, u12, l + cset u13, cs + + umulh l, a3, a4 + adds u8, u8, l + umulh l, a3, a5 + adcs u9, u9, l + umulh l, a3, a6 + adcs u10, u10, l + umulh l, a3, a7 + adcs u11, u11, l + umulh l, a4, a7 + adcs u12, u12, l + umulh l, a5, a7 + adc u13, u13, l + mul l, a6, a7 + umulh u14, a6, a7 + adds u13, u13, l + 
adc u14, u14, xzr + +// Double that, with u15 holding the top carry + + adds u1, u1, u1 + adcs u2, u2, u2 + adcs u3, u3, u3 + adcs u4, u4, u4 + adcs u5, u5, u5 + adcs u6, u6, u6 + adcs u7, u7, u7 + adcs u8, u8, u8 + adcs u9, u9, u9 + adcs u10, u10, u10 + adcs u11, u11, u11 + adcs u12, u12, u12 + adcs u13, u13, u13 + adcs u14, u14, u14 + cset u15, cs + +// Add the homogeneous terms 00 + 11 + 22 + 33 + 44 + 55 + 66 + 77 + + umulh l, a0, a0 + mul u0, a0, a0 + adds u1, u1, l + + mul l, a1, a1 + adcs u2, u2, l + umulh l, a1, a1 + adcs u3, u3, l + + mul l, a2, a2 + adcs u4, u4, l + umulh l, a2, a2 + adcs u5, u5, l + + mul l, a3, a3 + adcs u6, u6, l + umulh l, a3, a3 + adcs u7, u7, l + + mul l, a4, a4 + adcs u8, u8, l + umulh l, a4, a4 + adcs u9, u9, l + + mul l, a5, a5 + adcs u10, u10, l + umulh l, a5, a5 + adcs u11, u11, l + + mul l, a6, a6 + adcs u12, u12, l + umulh l, a6, a6 + adcs u13, u13, l + + mul l, a7, a7 + adcs u14, u14, l + umulh l, a7, a7 + adc u15, u15, l + +// Store back final result + + stp u0, u1, [z] + stp u2, u3, [z, #16] + stp u4, u5, [z, #32] + stp u6, u7, [z, #48] + stp u8, u9, [z, #64] + stp u10, u11, [z, #80] + stp u12, u13, [z, #96] + stp u14, u15, [z, #112] + +// Restore registers and return + + ldp x25, x26, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/arm/fastmul/bignum_emontredc_8n.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/unopt/bignum_emontredc_8n_base.S similarity index 90% rename from third_party/s2n-bignum/arm/fastmul/bignum_emontredc_8n.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/unopt/bignum_emontredc_8n_base.S index 081f5de362d..41e63d1fa27 100644 --- a/third_party/s2n-bignum/arm/fastmul/bignum_emontredc_8n.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/unopt/bignum_emontredc_8n_base.S @@ -5,7 +5,7 @@ // Extended Montgomery reduce in 8-digit blocks, results in input-output buffer // Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] // -// extern uint64_t bignum_emontredc_8n +// extern uint64_t bignum_emontredc_8n_base // (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w); // // Functionally equivalent to bignum_emontredc (see that file for more detail). 
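Reviewer note: the bignum_sqr_8_16_alt routine above follows the classic squaring split — build the off-diagonal products a[i]*a[j] (i < j) in a running window, double them, then fold in the diagonal (homogeneous) squares a[i]*a[i]. A schoolbook C model of that structure (illustrative only, not part of the import; it uses the compiler's unsigned __int128 for the 64x64 products and ignores the assembly's careful carry scheduling):

```c
#include <stdint.h>

// Schoolbook model of the 8x8 -> 16 word squaring structure: off-diagonal
// products, a doubling pass, then the diagonal squares a[i]^2.
static void sqr_8_16_model(uint64_t z[16], const uint64_t a[8]) {
  for (int i = 0; i < 16; i++) z[i] = 0;

  // Off-diagonal products a[i]*a[j] for i < j, accumulated column by column.
  for (int i = 0; i < 8; i++) {
    unsigned __int128 c = 0;
    for (int j = i + 1; j < 8; j++) {
      c += (unsigned __int128)a[i] * a[j] + z[i + j];
      z[i + j] = (uint64_t)c;
      c >>= 64;
    }
    z[i + 8] = (uint64_t)c;      // this slot has not been written yet
  }

  // Double the cross terms; the total still fits in 16 words.
  uint64_t carry = 0;
  for (int i = 0; i < 16; i++) {
    uint64_t w = z[i];
    z[i] = (w << 1) | carry;
    carry = w >> 63;
  }

  // Add the "homogeneous" terms a[i]*a[i] at word position 2*i.
  unsigned __int128 c = 0;
  for (int i = 0; i < 8; i++) {
    unsigned __int128 s = (unsigned __int128)a[i] * a[i];
    c += (unsigned __int128)z[2 * i] + (uint64_t)s;
    z[2 * i] = (uint64_t)c;
    c >>= 64;
    c += (unsigned __int128)z[2 * i + 1] + (uint64_t)(s >> 64);
    z[2 * i + 1] = (uint64_t)c;
    c >>= 64;
  }
}
```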
@@ -15,8 +15,8 @@ // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_emontredc_8n) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_emontredc_8n) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_emontredc_8n_base) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_emontredc_8n_base) .text .balign 4 @@ -29,18 +29,18 @@ // --------------------------------------------------------------------------- #define muldiffnadd(b,a, c,h,l,t, x,y, w,z) \ - subs t, x, y ; \ - cneg t, t, cc ; \ - csetm c, cc ; \ - subs h, w, z ; \ - cneg h, h, cc ; \ - mul l, t, h ; \ - umulh h, t, h ; \ - cinv c, c, cc ; \ - adds xzr, c, #1 ; \ - eor l, l, c ; \ - adcs a, a, l ; \ - eor h, h, c ; \ + subs t, x, y __LF\ + cneg t, t, cc __LF\ + csetm c, cc __LF\ + subs h, w, z __LF\ + cneg h, h, cc __LF\ + mul l, t, h __LF\ + umulh h, t, h __LF\ + cinv c, c, cc __LF\ + adds xzr, c, #1 __LF\ + eor l, l, c __LF\ + adcs a, a, l __LF\ + eor h, h, c __LF\ adcs b, b, h // The inputs, though k gets processed so we use a different name @@ -196,9 +196,9 @@ // Main code // ***************************************************** -S2N_BN_SYMBOL(bignum_emontredc_8n): +S2N_BN_SYMBOL(bignum_emontredc_8n_base): -stp x19, x20, [sp, #-16]! + stp x19, x20, [sp, #-16]! stp x21, x22, [sp, #-16]! stp x23, x24, [sp, #-16]! stp x25, x26, [sp, #-16]! @@ -211,7 +211,7 @@ stp x19, x20, [sp, #-16]! lsr k4m1, x0, #2 mov i, k4m1 subs c, k4m1, #1 - bcc bignum_emontredc_8n_end + bcc bignum_emontredc_8n_base_end mov tc, xzr lsl k4m1, c, #5 @@ -219,7 +219,7 @@ stp x19, x20, [sp, #-16]! // Rather than propagating the carry to the end each time, we // stop at the "natural" end and store top carry in tc as a bitmask. -bignum_emontredc_8n_outerloop: +bignum_emontredc_8n_base_outerloop: // Load [u3;u2;u1;u0] = bottom 4 digits of the input at current window @@ -325,9 +325,8 @@ bignum_emontredc_8n_outerloop: // Repeated multiply-add block to do the k/4-1 remaining 4-digit chunks - cbz k4m1, bignum_emontredc_8n_madddone mov j, k4m1 -bignum_emontredc_8n_maddloop: +bignum_emontredc_8n_base_maddloop: add m, m, #32 add z, z, #32 @@ -335,8 +334,8 @@ bignum_emontredc_8n_maddloop: ldp b2, b3, [m, #16] madd4 subs j, j, #32 - bne bignum_emontredc_8n_maddloop -bignum_emontredc_8n_madddone: + bne bignum_emontredc_8n_base_maddloop +bignum_emontredc_8n_base_madddone: // Add the carry out to the existing z contents, propagating the // top carry tc up by 32 places as we move "leftwards". @@ -360,14 +359,14 @@ bignum_emontredc_8n_madddone: // Bump up z only and keep going add z, z, #32 - subs i, i, #1 - bne bignum_emontredc_8n_outerloop + sub i, i, #1 + cbnz i, bignum_emontredc_8n_base_outerloop // Return the top carry as 0 or 1 (it's currently a bitmask) neg x0, tc -bignum_emontredc_8n_end: +bignum_emontredc_8n_base_end: ldp x27, x28, [sp], #16 ldp x25, x26, [sp], #16 ldp x23, x24, [sp], #16 diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/unopt/bignum_emontredc_8n_cdiff_base.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/unopt/bignum_emontredc_8n_cdiff_base.S new file mode 100644 index 00000000000..41c700b1d03 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/unopt/bignum_emontredc_8n_cdiff_base.S @@ -0,0 +1,681 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Extend Montgomery reduce in 8-digit blocks, uses an extra storage to +// temporarily cache multiplied differences appearing in ADK. +// Results are stored in input-output buffer (z). +// Inputs z[2*k], m[k], w; +// Outputs function return (extra result bit) and z[2*k] +// Temporary buffer m_precalc[12*(k/4-1)] +// +// extern uint64_t bignum_emontredc_8n_cdiff +// (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w, uint64_t *m_precalc); +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = m, X3 = w, X4 = m_precalc +// returns X0 +// +// This is an unoptimized version of bignum_emontredc_8n_cdiff. +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_emontredc_8n_cdiff_base) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_emontredc_8n_cdiff_base) + .text + .balign 4 + + // Silly SLOTHY limitation: It needs the loop counter to have the name 'count' + count .req x27 // inner loop counter + + // Semantically transparent instruction wrapper which is used by SLOTHY + // for dependency tracking through memory. SLOTHY itself has no notion of + // memory, only registers; to still track static dependencies through memory + // (register spills), a 'hint' register type is introduced (syntax t{i}, t{i}{j}) + // that's written to in store instructions and read from in the corresponding + // load instruction. + // + // The 'slothy:no-unfold' annotation prevents SLOTHY from opening the macro, + // and instead makes it treat `stph` as an instruction specified in the + // Arch and uArch models provided to it. + .macro stph a, b, addr, imm, hint // slothy:no-unfold + stp \a\(), \b\(), [\addr, \imm] + .endm + + .macro ldph a, b, addr, imm, hint // slothy:no-unfold + ldp \a\(), \b\(), [\addr, \imm] + .endm + + .macro ldrh a, addr, imm, hint // slothy:no-unfold + ldr \a\(), [\addr, \imm] + .endm + + // Helper macro for the pre-computations + .macro cdiff t, c, x, y + subs \t, \x, \y + cneg \t, \t, cc + csetm \c, cc + .endm + + // Some immediate offsets for cached differences+carry used + // in the inner ADK multiplications + #define cache_a01 (32+0*16) + #define cache_a02 (32+1*16) + #define cache_a03 (32+2*16) + #define cache_a12 (32+3*16) + #define cache_a13 (32+4*16) + #define cache_a23 (32+5*16) + #define cache_m10 (0*16) + #define cache_m20 (1*16) + #define cache_m30 (2*16) + #define cache_m21 (3*16) + #define cache_m31 (4*16) + #define cache_m32 (5*16) + + a0 .req x4 + a1 .req x5 + a2 .req x6 + a3 .req x7 + + vpre00 .req v30 + vpre01 .req v28 + vpre02 .req v17 + vpre10 .req v18 + vpre11 .req v19 + vpre12 .req v20 + + // Computes two 64x64->128-bit multiplication a*x and a*y + // v_in0: 128-bit input vector viewed as pair (x,y) of 64-bit numbers + // x_in: 64-bit common multiplicand a + // v_out0: 128-bit output vector to hold a*x + // v_out1: 128-bit output vector to hold a*y + // + // Uses temporaries as indicated in the following defines: + #define v_in0_p v3 + #define v_in0_pp v5 + #define v_in0_ppp v28 + #define v_in1 v0 + #define vtmp0 v4 + #define vtmp1 v6 + #define vtmp2 v7 + #define vtmp3 v16 + #define vtmp4 v2 + + .macro vmul_2x_64_64_128 v_in0, x_in, v_out0, v_out1 // slothy:no-unfold + dup v_in1.2d, \x_in + uzp2 v_in0_p.4s, \v_in0\().4s, \v_in0\().4s + xtn vtmp0.2s, v_in1.2d + xtn v_in0_pp.2s, \v_in0\().2d + rev64 v_in0_ppp.4s, \v_in0\().4s + umull vtmp1.2d, vtmp0.2s, 
v_in0_pp.2s + umull vtmp2.2d, vtmp0.2s, v_in0_p.2s + uzp2 vtmp3.4s, v_in1.4s, v_in1.4s + mul v_in1.4s, v_in0_ppp.4s, v_in1.4s + usra vtmp2.2d, vtmp1.2d, #32 + umull \v_out1\().2d, vtmp3.2s, v_in0_p.2s + uaddlp v_in1.2d, v_in1.4s + and vtmp4.16b, vtmp2.16b, v29.16b + umlal vtmp4.2d, vtmp3.2s, v_in0_pp.2s + shl \v_out0\().2d, v_in1.2d, #32 + usra \v_out1\().2d, vtmp2.2d, #32 + umlal \v_out0\().2d, vtmp0.2s, v_in0_pp.2s + usra \v_out1\().2d, vtmp4.2d, #32 + .endm + + // SLOTHY version of the above multiplication macro, using symbolic + // registers instead of hardcoded registers. This is only used during + // SLOTHY optimization (the above macro is ignored because of + // 'slothy:no-unfold'). +#if defined(SLOTHY) + .macro vmul_2x_64_64_128 v_in0, x_in, v_out0, v_out1 + dup V.2d, \x_in + uzp2 V.4s, \v_in0\().4s, \v_in0\().4s + xtn V.2s, V.2d + xtn V.2s, \v_in0\().2d + rev64 V.4s, \v_in0\().4s + umull V.2d, V.2s, V.2s + umull V.2d, V.2s, V.2s + uzp2 V.4s, V.4s, V.4s + mul V.4s, V.4s, V.4s + usra V.2d, V.2d, #32 + umull \v_out1\().2d, V.2s, V.2s + uaddlp V.2d, V.4s + and V.16b, V.16b, v29.16b + umlal V.2d, V.2s, V.2s + shl \v_out0\().2d, V.2d, #32 + usra \v_out1\().2d, V.2d, #32 + umlal \v_out0\().2d, V.2s, V.2s + usra \v_out1\().2d, V.2d, #32 + .endm +#endif + +S2N_BN_SYMBOL(bignum_emontredc_8n_cdiff_base): + + sub sp, sp, #(6*16) + stp x19, x20, [sp, #(5*16)] + stp x21, x22, [sp, #(4*16)] + stp x23, x24, [sp, #(3*16)] + stp x25, x26, [sp, #(2*16)] + stp x27, x28, [sp, #(1*16)] + stp x29, x30, [sp, #(0*16)] + + // Leave space for cached differences in inner loop + sub sp, sp, #(6*16) + + sub sp, sp, #32 + lsr x0, x0, #2 + mov x26, x0 + subs x12, x0, #1 + bcc bignum_emontredc_8n_cdiff_base_end + + // x30 = buffer holding precomputed ADK carry-differences for modulus + mov w30, #(12*8) + mul w30, w12, w30 + sub x30, sp, x30 + + // + // Start of precomputation + // + // Precompute and cache signed differences of modulus components + // used in the ADK multiplication in the inner loop. + // + // THIS SHOULD BE HOISTED OUT + // (and until then, comment out for benchmarking to get accurate estimates) + // + + // Number of extra limbs required: + // 6 * (number of limbs / 4 - 1) * 2 = 12 * (number_of_limbs/4 - 1) + // + // For now, just put them on the stack + mov sp, x30 + + // Save modulus pointer + mov x25, x2 + + mov count, x12 +bignum_emontredc_8n_cdiff_base_precomp: + ldp a0, a1, [x2, #32]! + ldp a2, a3, [x2, #16] + + t .req x28 + c .req x29 + + cdiff t, c, a1, a0 + stp t, c, [sp, #cache_m10] + cdiff t, c, a2, a0 + stp t, c, [sp, #cache_m20] + cdiff t, c, a3, a0 + stp t, c, [sp, #cache_m30] + cdiff t, c, a2, a1 + stp t, c, [sp, #cache_m21] + cdiff t, c, a3, a1 + stp t, c, [sp, #cache_m31] + cdiff t, c, a3, a2 + stp t, c, [sp, #cache_m32] + + add sp, sp, #(6*16) + + subs count, count, #1 + cbnz count, bignum_emontredc_8n_cdiff_base_precomp + + // Set modulus pointer back to its original value + mov x2, x25 + + // + // End of precomputation + // + + stp x3, x30, [sp] + //stp x3, xzr, [sp] + stp x26, xzr, [sp, #16] + mov x28, xzr + lsl x0, x12, #5 + + movi v29.2d, #0x000000ffffffff + +bignum_emontredc_8n_cdiff_base_outerloop: + ldr x3, [sp] + ldph x17, x19, x1, #0, t0 + ldph x20, x21, x1, #16, t1 + ldp x8, x9, [x2, #0] + ldp x10, x11, [x2, #16] + ldr q21, [x2, #16] + + // Montgomery step 0 + + mul x4, x17, x3 + // NEON: Calculate x4 * (x10, x11) that does two 64x64->128-bit multiplications. 
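Reviewer note: the vmul_2x_64_64_128 macro defined above packs two independent 64x64->128-bit products into the two 64-bit lanes of a vector register, forming each from 32-bit halves with umull/umlal and recombining the columns with shl/usra. Per lane this is the ordinary four-partial-product decomposition; a scalar C sketch of it (the function name is invented):

```c
#include <stdint.h>

// One 64x64 -> 128 multiply built from 32-bit halves, the same decomposition
// the NEON umull/umlal/usra sequence evaluates per 64-bit lane.
static void mul_64x64_128(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo) {
  uint64_t a0 = (uint32_t)a, a1 = a >> 32;
  uint64_t b0 = (uint32_t)b, b1 = b >> 32;
  uint64_t ll = a0 * b0;                                     // low  x low
  uint64_t lh = a0 * b1;                                     // low  x high
  uint64_t hl = a1 * b0;                                     // high x low
  uint64_t hh = a1 * b1;                                     // high x high
  uint64_t mid = (ll >> 32) + (uint32_t)lh + (uint32_t)hl;   // middle column
  *lo = (mid << 32) | (uint32_t)ll;
  *hi = hh + (lh >> 32) + (hl >> 32) + (mid >> 32);
}
```

Doing two of these at once on the vector unit is why the scalar code in each Montgomery step below only multiplies by x8 and x9 directly, while the x10/x11 products come back from the NEON registers.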
+ vmul_2x_64_64_128 v21, x4, v0, v1 + mov x14, v0.d[0] + mov x15, v0.d[1] + mul x12, x4, x8 + adds x17, x17, x12 + umulh x12, x4, x8 + mul x13, x4, x9 + adcs x19, x19, x13 + umulh x13, x4, x9 + adcs x20, x20, x14 + adcs x21, x21, x15 + mov x14, v1.d[0] + mov x15, v1.d[1] + adc x22, xzr, xzr + adds x19, x19, x12 + adcs x20, x20, x13 + adcs x21, x21, x14 + adc x22, x22, x15 + + // Montgomery step 1 + + mul x5, x19, x3 + // NEON: Calculate x5 * (x10, x11) that does two 64x64->128-bit multiplications. + vmul_2x_64_64_128 v21, x5, v0, v1 + mov x14, v0.d[0] + mov x15, v0.d[1] + mul x12, x5, x8 + adds x19, x19, x12 + umulh x12, x5, x8 + mul x13, x5, x9 + adcs x20, x20, x13 + umulh x13, x5, x9 + adcs x21, x21, x14 + adcs x22, x22, x15 + mov x14, v1.d[0] + mov x15, v1.d[1] + adc x23, xzr, xzr + adds x20, x20, x12 + adcs x21, x21, x13 + adcs x22, x22, x14 + adc x23, x23, x15 + stph x4, x5, x1, #0, t0 + + // Montgomery step 2 + + mul x6, x20, x3 + // NEON: Calculate x6 * (x10, x11) that does two 64x64->128-bit multiplications. + vmul_2x_64_64_128 v21, x6, v21, v1 + mov x14, v21.d[0] + mov x15, v21.d[1] + mul x12, x6, x8 + adds x20, x20, x12 + umulh x12, x6, x8 + mul x13, x6, x9 + adcs x21, x21, x13 + umulh x13, x6, x9 + adcs x22, x22, x14 + adcs x23, x23, x15 + mov x14, v1.d[0] + mov x15, v1.d[1] + adc x24, xzr, xzr + adds x21, x21, x12 + mul x7, x21, x3 + adcs x22, x22, x13 + adcs x23, x23, x14 + adc x24, x24, x15 + + stph x6, x7, x1, #16, t1 + + // Montgomery step 3 + + mul x12, x7, x8 + mul x13, x7, x9 + mul x14, x7, x10 + mul x15, x7, x11 + adds x21, x21, x12 + umulh x12, x7, x8 + adcs x22, x22, x13 + umulh x13, x7, x9 + adcs x23, x23, x14 + umulh x14, x7, x10 + adcs x24, x24, x15 + umulh x15, x7, x11 + adc x25, xzr, xzr + adds x12, x22, x12 + adcs x13, x23, x13 + adcs x14, x24, x14 + adc x15, x25, x15 + + lsr count, x0, #5 + + ldrh q20, x1, #0, t0 + ldrh q21, x1, #16, t1 + + // Precompute and cache differences required in the + // ADK multiplication conducted by the innerl oop. + // Save each difference (somewhat inefficiently) + // as a pair (t,c) of 64-bit + carry. + // + // The same caching trick is applied to the modulus, + // for which the various differences can even be hoisted + // out of the entire multiplication routine. + + // a0 - a1 with carry + cdiff x16,x26,a0,a1 + stph x16, x26, sp, #cache_a01, t01 + // a0 - a2 with carry + cdiff x16,x26,a0,a2 + stph x16, x26, sp, #cache_a02, t02 + // a0 - a3 with carry + cdiff x16,x26,a0,a3 + stph x16, x26, sp, #cache_a03, t03 + // a1 - a2 with carry + cdiff x16,x26,a1,a2 + stph x16, x26, sp, #cache_a12, t12 + // a1 - a3 with carry + cdiff x16,x26,a1,a3 + stph x16, x26, sp, #cache_a13, t13 + // a2 - a3 with carry + cdiff x16,x26,a2,a3 + stph x16, x26, sp, #cache_a23, t23 + + // Precompute and cache some precomputations for + // the Neon multiplications in the inner loop + uzp2 vpre00.4s, v20.4s, v20.4s + xtn vpre01.2s, v20.2d + rev64 vpre02.4s, v20.4s + uzp2 vpre10.4s, v21.4s, v21.4s + xtn vpre11.2s, v21.2d + rev64 vpre12.4s, v21.4s + +bignum_emontredc_8n_cdiff_base_maddloop_neon: + + ldr q22, [x2, #32]! 
+ ldr q23, [x2, #16] + + xtn v4.2s, v22.2d + umull v6.2d, v4.2s, vpre01.2s + umull v7.2d, v4.2s, vpre00.2s + uzp2 v16.4s, v22.4s, v22.4s + mul v0.4s, vpre02.4s, v22.4s + usra v7.2d, v6.2d, #32 + umull v25.2d, v16.2s, vpre00.2s + uaddlp v0.2d, v0.4s + and v2.16b, v7.16b, v29.16b + umlal v2.2d, v16.2s, vpre01.2s + shl v24.2d, v0.2d, #32 + usra v25.2d, v7.2d, #32 + umlal v24.2d, v4.2s, vpre01.2s + usra v25.2d, v2.2d, #32 + + // Original version without caching + // uzp2 v3.4s, v22.4s, v22.4s + // xtn v4.2s, v20.2d + // xtn v5.2s, v22.2d + // rev64 v1.4s, v22.4s + // umull v6.2d, v4.2s, v5.2s + // umull v7.2d, v4.2s, v3.2s + // uzp2 v16.4s, v20.4s, v20.4s + // mul v0.4s, v1.4s, v20.4s + // usra v7.2d, v6.2d, #32 + // umull v25.2d, v16.2s, v3.2s + // uaddlp v0.2d, v0.4s + // and v2.16b, v7.16b, v29.16b + // umlal v2.2d, v16.2s, v5.2s + // shl v24.2d, v0.2d, #32 + // usra v25.2d, v7.2d, #32 + // umlal v24.2d, v4.2s, v5.2s + // usra v25.2d, v2.2d, #32 + + xtn v4.2s, v23.2d + umull v6.2d, v4.2s, vpre11.2s + umull v7.2d, v4.2s, vpre10.2s + uzp2 v16.4s, v23.4s, v23.4s + mul v0.4s, vpre12.4s, v23.4s + usra v7.2d, v6.2d, #32 + umull v27.2d, v16.2s, vpre10.2s + uaddlp v0.2d, v0.4s + and v2.16b, v7.16b, v29.16b + umlal v2.2d, v16.2s, vpre11.2s + shl v26.2d, v0.2d, #32 + usra v27.2d, v7.2d, #32 + umlal v26.2d, v4.2s, vpre11.2s + usra v27.2d, v2.2d, #32 + + // Original version without caching + // uzp2 v3.4s, v23.4s, v23.4s + // xtn v4.2s, v21.2d + // xtn v5.2s, v23.2d + // rev64 v1.4s, v23.4s + // umull v6.2d, v4.2s, v5.2s + // umull v7.2d, v4.2s, v3.2s + // uzp2 v16.4s, v21.4s, v21.4s + // mul v0.4s, v1.4s, v21.4s + // usra v7.2d, v6.2d, #32 + // umull v27.2d, v16.2s, v3.2s + // uaddlp v0.2d, v0.4s + // and v2.16b, v7.16b, v29.16b + // umlal v2.2d, v16.2s, v5.2s + // shl v26.2d, v0.2d, #32 + // usra v27.2d, v7.2d, #32 + // umlal v26.2d, v4.2s, v5.2s + // usra v27.2d, v2.2d, #32 + + mov x16, v25.d[0] // hi bits of (x4 * x8) + mov x26, v27.d[0] // hi bits of (x6 * x10) + mov x3, v25.d[1] // hi bits of (x5 * x9) + mov x17, v27.d[1] // hi bits of (x6 * x10) + + mov x20, v24.d[1] // lo bits of (x5 * x9) + mov x21, v26.d[0] // lo bits of (x6 * x10) + mov x24, v26.d[1] // lo bits of (x7 * x11) + + // Not necessary if one uses cached differences for the modulus + //ldp x8, x9, [x2, #0] + //ldp x10, x11, [x2, #16] + + adds x22, x20, x16 + adcs x23, x21, x3 + adcs x24, x24, x26 + adc x25, x17, xzr + mov x17, v24.d[0] // lo bits of (x4 * x8) + ldp x20, x21, [x1, #32]! 
+ adds x12, x12, x20 + adcs x13, x13, x21 + ldp x20, x21, [x1, #16] + adcs x14, x14, x20 + adcs x15, x15, x21 + adc x16, xzr, xzr + adds x19, x22, x17 + adcs x22, x23, x22 + adcs x23, x24, x23 + adcs x24, x25, x24 + adc x25, xzr, x25 + adds x20, x22, x17 + adcs x21, x23, x19 + adcs x22, x24, x22 + adcs x23, x25, x23 + adcs x24, xzr, x24 + adc x25, xzr, x25 + adds x17, x17, x12 + adcs x19, x19, x13 + adcs x20, x20, x14 + adcs x21, x21, x15 + adcs x22, x22, x16 + adcs x23, x23, xzr + adcs x24, x24, xzr + adc x25, x25, xzr + + ldph x15, x12, sp, #cache_a23, t23 + // Original code without caching + //subs x15, x6, x7 + //cneg x15, x15, cc + //csetm x12, cc + + ldp x13, x14, [x30, #cache_m32] + eor x12, x12, x14 + // Original code without caching + //cdiff x13, x14, x11, x10 + //subs x13, x11, x10 + //cneg x13, x13, cc + + mul x14, x15, x13 + umulh x13, x15, x13 + adds xzr, x12, #1 + eor x14, x14, x12 + adcs x23, x23, x14 + eor x13, x13, x12 + adcs x24, x24, x13 + adc x25, x25, x12 + + ldph x15, x12, sp, #cache_a01, t01 + //subs x15, x4, x5 + //cneg x15, x15, cc + //csetm x12, cc + + ldp x13, x14, [x30, #cache_m10] + eor x12, x12, x14 + // Original code without caching + //subs x13, x9, x8 + //cneg x13, x13, cc + //cinv x12, x12, cc + + mul x14, x15, x13 + umulh x13, x15, x13 + adds xzr, x12, #1 + eor x14, x14, x12 + adcs x19, x19, x14 + eor x13, x13, x12 + adcs x20, x20, x13 + adcs x21, x21, x12 + adcs x22, x22, x12 + adcs x23, x23, x12 + adcs x24, x24, x12 + adc x25, x25, x12 + + stp x17, x19, [x1, #0] + + ldph x15, x12, sp, #cache_a13, t13 + //subs x15, x5, x7 + //cneg x15, x15, cc + //csetm x12, cc + + ldp x13, x14, [x30, #cache_m31] + eor x12, x12, x14 + // Original code without caching + //subs x13, x11, x9 + //cneg x13, x13, cc + //cinv x12, x12, cc + + mul x14, x15, x13 + umulh x13, x15, x13 + adds xzr, x12, #1 + eor x14, x14, x12 + adcs x22, x22, x14 + eor x13, x13, x12 + adcs x23, x23, x13 + adcs x24, x24, x12 + adc x25, x25, x12 + + ldph x15, x12, sp, #cache_a02, t02 + //subs x15, x4, x6 + //cneg x15, x15, cc + //csetm x12, cc + + ldp x13, x14, [x30, #cache_m20] + eor x12, x12, x14 + // Original code without caching + //subs x13, x10, x8 + //cneg x13, x13, cc + //cinv x12, x12, cc + + mul x14, x15, x13 + umulh x13, x15, x13 + adds xzr, x12, #1 + eor x14, x14, x12 + adcs x20, x20, x14 + eor x13, x13, x12 + adcs x21, x21, x13 + adcs x22, x22, x12 + adcs x23, x23, x12 + adcs x24, x24, x12 + adc x25, x25, x12 + + ldph x15, x12, sp, #cache_a03, t03 + //subs x15, x4, x7 + //cneg x15, x15, cc + //csetm x12, cc + + ldp x13, x14, [x30, #cache_m30] + eor x12, x12, x14 + // Original code without caching + //subs x13, x11, x8 + //cneg x13, x13, cc + //cinv x12, x12, cc + + mul x14, x15, x13 + umulh x13, x15, x13 + adds xzr, x12, #1 + eor x14, x14, x12 + adcs x21, x21, x14 + eor x13, x13, x12 + adcs x22, x22, x13 + adcs x23, x23, x12 + adcs x24, x24, x12 + adc x25, x25, x12 + + ldph x15, x12, sp, #cache_a12, t12 + //subs x15, x5, x6 + //cneg x15, x15, cc + //csetm x12, cc + + ldp x13, x14, [x30, #cache_m21] + eor x12, x12, x14 + // Original code without caching + //subs x13, x10, x9 + //cneg x13, x13, cc + //cinv x12, x12, cc + + mul x14, x15, x13 + umulh x13, x15, x13 + adds xzr, x12, #1 + eor x14, x14, x12 + adcs x21, x21, x14 + + stp x20, x21, [x1, #16] + eor x13, x13, x12 + adcs x22, x22, x13 + adcs x13, x23, x12 + adcs x14, x24, x12 + adc x15, x25, x12 + mov x12, x22 + + add x30, x30, #96 + + sub count, count, #1 + cbnz count, bignum_emontredc_8n_cdiff_base_maddloop_neon + + ldp x17, x19, 
[x1, #32] + ldp x20, x21, [x1, #48] + ldp x26, xzr, [sp, #16] + adds xzr, x28, x28 + adcs x17, x17, x12 + adcs x19, x19, x13 + adcs x20, x20, x14 + adcs x21, x21, x15 + csetm x28, cs + stp x17, x19, [x1, #32] + stp x20, x21, [x1, #48] + sub x1, x1, x0 + sub x2, x2, x0 + add x1, x1, #32 + subs x26, x26, #1 + stp x26, xzr, [sp, #16] + + // Restore buffer base for cached modulus differences + ldr x30, [sp, #8] + + bne bignum_emontredc_8n_cdiff_base_outerloop + neg x0, x28 + +bignum_emontredc_8n_cdiff_base_end: + add sp, sp, #32 + add sp, sp, #(6*16) + + ldp x29, x30, [sp, #(0*16)] + ldp x27, x28, [sp, #(1*16)] + ldp x25, x26, [sp, #(2*16)] + ldp x23, x24, [sp, #(3*16)] + ldp x21, x22, [sp, #(4*16)] + ldp x19, x20, [sp, #(5*16)] + add sp, sp, #(6*16) + + ret + diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/unopt/bignum_mul_8_16_base.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/unopt/bignum_mul_8_16_base.S new file mode 100644 index 00000000000..d87eb806bc5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/unopt/bignum_mul_8_16_base.S @@ -0,0 +1,349 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[8], y[8]; output z[16] +// +// extern void bignum_mul_8_16_base +// (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16_base) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16_base) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Macro computing [c,b,a] := [b,a] + (x - y) * (w - z), adding with carry +// to the [b,a] components but leaving CF aligned with the c term, which is +// a sign bitmask for (x - y) * (w - z). Continued add-with-carry operations +// with [c,...,c] will continue the carry chain correctly starting from +// the c position if desired to add to a longer term of the form [...,b,a]. +// +// c,h,l,t should all be different and t,h should not overlap w,z. 
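Reviewer note: the muldiffnadd macro documented just above is the core trick of the ADK multiplications in these files — the signed product (x - y)*(w - z) is formed in sign-magnitude form, and the sign is kept as an all-zeros/all-ones mask c so it can be folded into the accumulator (and propagated into higher words) with ordinary add-with-carry. A C model of the sign/magnitude part (illustrative only; the real macro keeps c live in a register and continues the chain with adcs):

```c
#include <stdint.h>

// Sign-magnitude model of the difference product in muldiffnadd: return
// |x - y| * |w - z| and set *c to all-ones exactly when (x - y)*(w - z) is
// negative.  The macro then adds the product words XOR c with the carry flag
// preset by "adds xzr, c, #1" (carry-in 1 in the negative case, supplying the
// +1 of the two's complement), and keeps adding c into higher words to finish
// the sign extension.
static unsigned __int128 diff_product(uint64_t x, uint64_t y,
                                      uint64_t w, uint64_t z, uint64_t *c) {
  uint64_t t = x < y ? y - x : x - y;            // subs / cneg
  uint64_t h = w < z ? z - w : w - z;            // subs / cneg
  *c = ((x < y) ^ (w < z)) ? ~(uint64_t)0 : 0;   // csetm / cinv
  return (unsigned __int128)t * h;               // mul / umulh
}
```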
+// --------------------------------------------------------------------------- + +.macro muldiffnadd b,a, c,h,l,t, x,y, w,z + subs \t, \x, \y + cneg \t, \t, cc + csetm \c, cc + subs \h, \w, \z + cneg \h, \h, cc + mul \l, \t, \h + umulh \h, \t, \h + cinv \c, \c, cc + adds xzr, \c, #1 + eor \l, \l, \c + adcs \a, \a, \l + eor \h, \h, \c + adcs \b, \b, \h +.endm + +#define z x0 +#define x x1 +#define y x2 + +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define a3 x6 +#define b0 x7 +#define b1 x8 +#define b2 x9 +#define b3 x10 + +#define s0 x11 +#define s1 x12 +#define s2 x13 +#define s3 x14 +#define s4 x15 +#define s5 x16 +#define s6 x17 +#define s7 x19 + +#define c x20 +#define h x21 +#define l x22 +#define m x23 +#define t x24 + +// These alias the ax and bx values, and are only used when they are done with + +#define u0 x3 +#define u1 x4 +#define u2 x5 +#define u3 x6 +#define u4 x7 +#define u5 x8 +#define u6 x9 +#define u7 x10 + +// These alias c,h,l,m but leave s, t and d safe, all we need + +#define u8 x20 +#define u9 x21 +#define u10 x22 +#define u11 x23 + +// We recycle the input pointers near the end + +#define s x1 +#define d x2 + +// --------------------------------------------------------------------------- +// Core 4x4->8 ADK multiplication macro +// Does [s7,s6,s5,s4,s3,s2,s1,s0] = [a3,a2,a1,a0] * [b3,b2,b1,b0] +// +// If the input parameter is 1, it also adds in [z+32,z+40,z+48,z+56] +// existing contents; if the parameter is 0 it just does the pure multiply +// --------------------------------------------------------------------------- + +.macro mul4 afl + +// First accumulate all the "simple" products as [s7,s6,s5,s4,s0] + + mul s0, a0, b0 + mul s4, a1, b1 + mul s5, a2, b2 + mul s6, a3, b3 + + umulh s7, a0, b0 + adds s4, s4, s7 + umulh s7, a1, b1 + adcs s5, s5, s7 + umulh s7, a2, b2 + adcs s6, s6, s7 + umulh s7, a3, b3 + adc s7, s7, xzr + +// Multiply by B + 1 to get [s7;s6;s5;s4;s1;s0] + + adds s1, s4, s0 + adcs s4, s5, s4 + adcs s5, s6, s5 + adcs s6, s7, s6 + adc s7, xzr, s7 + +// Multiply by B^2 + 1 to get [s7;s6;s5;s4;s3;s2;s1;s0] + + adds s2, s4, s0 + adcs s3, s5, s1 + adcs s4, s6, s4 + adcs s5, s7, s5 + adcs s6, xzr, s6 + adc s7, xzr, s7 + +// Optionally add the existing z contents + +.rep \afl + ldp l, h, [z,#32] + adds s0, s0, l + adcs s1, s1, h + ldp l, h, [z,#48] + adcs s2, s2, l + adcs s3, s3, h + adcs s4, s4, xzr + adcs s5, s5, xzr + adcs s6, s6, xzr + adc s7, s7, xzr +.endr + +// Now add in all the "complicated" terms. + + muldiffnadd s6,s5, c,h,l,t, a2,a3, b3,b2 + adc s7, s7, c + + muldiffnadd s2,s1, c,h,l,t, a0,a1, b1,b0 + adcs s3, s3, c + adcs s4, s4, c + adcs s5, s5, c + adcs s6, s6, c + adc s7, s7, c + + muldiffnadd s5,s4, c,h,l,t, a1,a3, b3,b1 + adcs s6, s6, c + adc s7, s7, c + + muldiffnadd s3,s2, c,h,l,t, a0,a2, b2,b0 + adcs s4, s4, c + adcs s5, s5, c + adcs s6, s6, c + adc s7, s7, c + + muldiffnadd s4,s3, c,h,l,t, a0,a3, b3,b0 + adcs s5, s5, c + adcs s6, s6, c + adc s7, s7, c + muldiffnadd s4,s3, c,h,l,t, a1,a2, b2,b1 + adcs s5, s5, c + adcs s6, s6, c + adc s7, s7, c +.endm + +// --------------------------------------------------------------------------- +// The main code +// --------------------------------------------------------------------------- + +S2N_BN_SYMBOL(bignum_mul_8_16_base): + +// Save registers + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + +// Multiply the low halves and then the high halves using ADK 4x4->8. 
+// For the second one add the top of the low part (Q1) already into +// the bottom of the high part (Q2) so that is already dealt with. +// +// Write back the first one but defer the second till a bit later while +// we get on with the absolute difference computations + + ldp a0, a1, [x] + ldp b0, b1, [y] + ldp a2, a3, [x, #16] + ldp b2, b3, [y, #16] + + mul4 0 + + ldp a0, a1, [x, #32] + stp s0, s1, [z] + ldp b0, b1, [y, #32] + stp s2, s3, [z, #16] + ldp a2, a3, [x, #48] + stp s4, s5, [z, #32] + ldp b2, b3, [y, #48] + stp s6, s7, [z, #48] + + mul4 1 + +// Compute t,[a3,a2,a1,a0] = x_hi - x_lo +// and s,[b3,b2,b1,b0] = y_lo - y_hi +// sign-magnitude differences, and scatter in belated high writeback + + ldp l, h, [x] + subs a0, a0, l + sbcs a1, a1, h + ldp l, h, [x, #16] + sbcs a2, a2, l + sbcs a3, a3, h + csetm t, cc + + stp s0, s1, [z, #64] + + ldp l, h, [y] + subs b0, l, b0 + sbcs b1, h, b1 + ldp l, h, [y, #16] + sbcs b2, l, b2 + sbcs b3, h, b3 + csetm s, cc + + stp s2, s3, [z, #80] + + eor a0, a0, t + subs a0, a0, t + eor a1, a1, t + sbcs a1, a1, t + eor a2, a2, t + sbcs a2, a2, t + eor a3, a3, t + sbc a3, a3, t + + stp s4, s5, [z, #96] + + eor b0, b0, s + subs b0, b0, s + eor b1, b1, s + sbcs b1, b1, s + eor b2, b2, s + sbcs b2, b2, s + eor b3, b3, s + sbc b3, b3, s + + stp s6, s7, [z, #112] + +// Save the correct sign for the sub-product + + eor s, s, t + +// Now yet another 4x4->8 ADK core, but not writing back, keeping s0..s7 + + mul4 0 + +// Now accumulate the positive mid-terms as [u7,u6,u5,u4,u3.u2,u1,u0] + + ldp u0, u1, [z] + ldp u4, u5, [z,#64] + adds u0, u0, u4 + adcs u1, u1, u5 + ldp u2, u3, [z,#16] + ldp u6, u7, [z,#80] + adcs u2, u2, u6 + adcs u3, u3, u7 + ldp u8, u9, [z,#96] + adcs u4, u4, u8 + adcs u5, u5, u9 + ldp u10, u11, [z,#112] + adcs u6, u6, u10 + adcs u7, u7, u11 + +// Stop the carry here so we can reintroduce it, taking into account the +// effective addition of s from sign-extension below. Note that we get +// a duplicated word c+carry beyond the first one, so this upper part is +// of the form [d,d,d,t]. + + adcs t, s, xzr + adc d, s, xzr + +// Add in the sign-adjusted complex term + + adds xzr, s, #1 + eor s0, s0, s + adcs u0, s0, u0 + eor s1, s1, s + adcs u1, s1, u1 + eor s2, s2, s + adcs u2, s2, u2 + eor s3, s3, s + adcs u3, s3, u3 + eor s4, s4, s + adcs u4, s4, u4 + eor s5, s5, s + adcs u5, s5, u5 + eor s6, s6, s + adcs u6, s6, u6 + eor s7, s7, s + adcs u7, s7, u7 + +// From this point on replace the sign with the suspended carry indication + + adcs u8, u8, t + adcs u9, u9, d + adcs u10, u10, d + adc u11, u11, d + +// Store it back + + stp u0, u1, [z,#32] + stp u2, u3, [z,#48] + stp u4, u5, [z,#64] + stp u6, u7, [z,#80] + stp u8, u9, [z,#96] + stp u10, u11, [z,#112] + +// Restore regs and return + + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/unopt/bignum_sqr_8_16_base.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/unopt/bignum_sqr_8_16_base.S new file mode 100644 index 00000000000..3cd6ecf68bc --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/unopt/bignum_sqr_8_16_base.S @@ -0,0 +1,356 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[8]; output z[16] +// +// extern void bignum_sqr_8_16_base (uint64_t z[static 16], uint64_t x[static 8]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16_base) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16_base) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Macro computing [c,b,a] := [b,a] + (x - y) * (w - z), adding with carry +// to the [b,a] components but leaving CF aligned with the c term, which is +// a sign bitmask for (x - y) * (w - z). Continued add-with-carry operations +// with [c,...,c] will continue the carry chain correctly starting from +// the c position if desired to add to a longer term of the form [...,b,a]. +// +// c,h,l,t should all be different and t,h should not overlap w,z. +// --------------------------------------------------------------------------- + +.macro muldiffnadd b,a, c,h,l,t, x,y, w,z + subs \t, \x, \y + cneg \t, \t, cc + csetm \c, cc + subs \h, \w, \z + cneg \h, \h, cc + mul \l, \t, \h + umulh \h, \t, \h + cinv \c, \c, cc + adds xzr, \c, #1 + eor \l, \l, \c + adcs \a, \a, \l + eor \h, \h, \c + adcs \b, \b, \h +.endm + +#define z x0 +#define x x1 + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 +#define b0 x6 +#define b1 x7 +#define b2 x8 +#define b3 x9 + +#define s0 x10 +#define s1 x11 +#define s2 x12 +#define s3 x13 +#define s4 x14 +#define s5 x15 +#define s6 x16 +#define s7 x17 + +#define c x19 +#define h x20 +#define l x21 +#define t x22 + +// --------------------------------------------------------------------------- +// Core 4x4->8 ADK multiplication macro +// Does [s7,s6,s5,s4,s3,s2,s1,s0] = [a3,a2,a1,a0] * [b3,b2,b1,b0] +// --------------------------------------------------------------------------- + +.macro mul4 + +// First accumulate all the "simple" products as [s7,s6,s5,s4,s0] + + mul s0, a0, b0 + mul s4, a1, b1 + mul s5, a2, b2 + mul s6, a3, b3 + + umulh s7, a0, b0 + adds s4, s4, s7 + umulh s7, a1, b1 + adcs s5, s5, s7 + umulh s7, a2, b2 + adcs s6, s6, s7 + umulh s7, a3, b3 + adc s7, s7, xzr + +// Multiply by B + 1 to get [s7;s6;s5;s4;s1;s0] + + adds s1, s4, s0 + adcs s4, s5, s4 + adcs s5, s6, s5 + adcs s6, s7, s6 + adc s7, xzr, s7 + +// Multiply by B^2 + 1 to get [s7;s6;s5;s4;s3;s2;s1;s0] + + adds s2, s4, s0 + adcs s3, s5, s1 + adcs s4, s6, s4 + adcs s5, s7, s5 + adcs s6, xzr, s6 + adc s7, xzr, s7 + +// Now add in all the "complicated" terms. 
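Reviewer note: the mul4 macro needs only the four diagonal products a_i*b_i up front (spread across positions by the B+1 and B^2+1 passes); the six "complicated" terms it then adds are signed difference products. That works because a_i*b_j + a_j*b_i = a_i*b_i + a_j*b_j + (a_i - a_j)*(b_j - b_i), which can be sanity-checked with a tiny C harness (illustrative only):

```c
#include <assert.h>
#include <stdint.h>

typedef unsigned __int128 u128;

// Check a_i*b_j + a_j*b_i == a_i*b_i + a_j*b_j + (a_i - a_j)*(b_j - b_i).
// The comparison is modulo 2^128 (unsigned wraparound), which suffices
// because the identity holds exactly over the integers.
static void check_adk_identity(uint64_t ai, uint64_t aj,
                               uint64_t bi, uint64_t bj) {
  u128 lhs = (u128)ai * bj + (u128)aj * bi;
  u128 rhs = (u128)ai * bi + (u128)aj * bj
           + ((u128)ai - aj) * ((u128)bj - bi);  // differences wrap like two's complement
  assert(lhs == rhs);
}
```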
+ + muldiffnadd s6,s5, c,h,l,t, a2,a3, b3,b2 + adc s7, s7, c + + muldiffnadd s2,s1, c,h,l,t, a0,a1, b1,b0 + adcs s3, s3, c + adcs s4, s4, c + adcs s5, s5, c + adcs s6, s6, c + adc s7, s7, c + + muldiffnadd s5,s4, c,h,l,t, a1,a3, b3,b1 + adcs s6, s6, c + adc s7, s7, c + + muldiffnadd s3,s2, c,h,l,t, a0,a2, b2,b0 + adcs s4, s4, c + adcs s5, s5, c + adcs s6, s6, c + adc s7, s7, c + + muldiffnadd s4,s3, c,h,l,t, a0,a3, b3,b0 + adcs s5, s5, c + adcs s6, s6, c + adc s7, s7, c + muldiffnadd s4,s3, c,h,l,t, a1,a2, b2,b1 + adcs s5, s5, c + adcs s6, s6, c + adc s7, s7, c +.endm + +// --------------------------------------------------------------------------- +// The main code +// --------------------------------------------------------------------------- + +S2N_BN_SYMBOL(bignum_sqr_8_16_base): + +// Save registers + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + +// Load all the inputs first + + ldp a0, a1, [x] + ldp a2, a3, [x, #16] + ldp b0, b1, [x, #32] + ldp b2, b3, [x, #48] + +// Square the lower half with a near-clone of bignum_sqr_4_8 + + mul x17, x2, x4 + mul x14, x3, x5 + umulh x20, x2, x4 + subs x21, x2, x3 + cneg x21, x21, cc + csetm x11, cc + subs x12, x5, x4 + cneg x12, x12, cc + mul x13, x21, x12 + umulh x12, x21, x12 + cinv x11, x11, cc + eor x13, x13, x11 + eor x12, x12, x11 + adds x19, x17, x20 + adc x20, x20, xzr + umulh x21, x3, x5 + adds x19, x19, x14 + adcs x20, x20, x21 + adc x21, x21, xzr + adds x20, x20, x14 + adc x21, x21, xzr + cmn x11, #0x1 + adcs x19, x19, x13 + adcs x20, x20, x12 + adc x21, x21, x11 + adds x17, x17, x17 + adcs x19, x19, x19 + adcs x20, x20, x20 + adcs x21, x21, x21 + adc x10, xzr, xzr + mul x12, x2, x2 + mul x13, x3, x3 + mul x15, x2, x3 + umulh x11, x2, x2 + umulh x14, x3, x3 + umulh x16, x2, x3 + adds x11, x11, x15 + adcs x13, x13, x16 + adc x14, x14, xzr + adds x11, x11, x15 + adcs x13, x13, x16 + adc x14, x14, xzr + stp x12, x11, [z] + adds x17, x17, x13 + adcs x19, x19, x14 + adcs x20, x20, xzr + adcs x21, x21, xzr + adc x10, x10, xzr + stp x17, x19, [z, #16] + mul x12, x4, x4 + mul x13, x5, x5 + mul x15, x4, x5 + umulh x11, x4, x4 + umulh x14, x5, x5 + umulh x16, x4, x5 + adds x11, x11, x15 + adcs x13, x13, x16 + adc x14, x14, xzr + adds x11, x11, x15 + adcs x13, x13, x16 + adc x14, x14, xzr + adds x12, x12, x20 + adcs x11, x11, x21 + stp x12, x11, [z, #32] + adcs x13, x13, x10 + adc x14, x14, xzr + stp x13, x14, [z, #48] + +// Square the upper half with a slight variant of the previous block + + mul x17, x6, x8 + mul x14, x7, x9 + umulh x20, x6, x8 + subs x21, x6, x7 + cneg x21, x21, cc + csetm x11, cc + subs x12, x9, x8 + cneg x12, x12, cc + mul x13, x21, x12 + umulh x12, x21, x12 + cinv x11, x11, cc + eor x13, x13, x11 + eor x12, x12, x11 + adds x19, x17, x20 + adc x20, x20, xzr + umulh x21, x7, x9 + adds x19, x19, x14 + adcs x20, x20, x21 + adc x21, x21, xzr + adds x20, x20, x14 + adc x21, x21, xzr + cmn x11, #0x1 + adcs x19, x19, x13 + adcs x20, x20, x12 + adc x21, x21, x11 + adds x17, x17, x17 + adcs x19, x19, x19 + adcs x20, x20, x20 + adcs x21, x21, x21 + adc x10, xzr, xzr + mul x12, x6, x6 + mul x13, x7, x7 + mul x15, x6, x7 + umulh x11, x6, x6 + umulh x14, x7, x7 + umulh x16, x6, x7 + adds x11, x11, x15 + adcs x13, x13, x16 + adc x14, x14, xzr + adds x11, x11, x15 + adcs x13, x13, x16 + adc x14, x14, xzr + stp x12, x11, [z, #64] + adds x17, x17, x13 + adcs x19, x19, x14 + adcs x20, x20, xzr + adcs x21, x21, xzr + adc x10, x10, xzr + stp x17, x19, [z, #80] + mul x12, x8, x8 + mul x13, x9, x9 + mul x15, x8, x9 + umulh x11, x8, x8 
+ umulh x14, x9, x9 + umulh x16, x8, x9 + adds x11, x11, x15 + adcs x13, x13, x16 + adc x14, x14, xzr + adds x11, x11, x15 + adcs x13, x13, x16 + adc x14, x14, xzr + adds x12, x12, x20 + adcs x11, x11, x21 + stp x12, x11, [z, #96] + adcs x13, x13, x10 + adc x14, x14, xzr + stp x13, x14, [z, #112] + +// Now get the cross-product in [s7,...,s0] and double it as [c,s7,...,s0] + + mul4 + + adds s0, s0, s0 + adcs s1, s1, s1 + adcs s2, s2, s2 + adcs s3, s3, s3 + adcs s4, s4, s4 + adcs s5, s5, s5 + adcs s6, s6, s6 + adcs s7, s7, s7 + adc c, xzr, xzr + +// Add it back to the buffer + + ldp a0, a1, [z, #32] + adds s0, s0, a0 + adcs s1, s1, a1 + stp s0, s1, [z, #32] + + ldp a0, a1, [z, #48] + adcs s2, s2, a0 + adcs s3, s3, a1 + stp s2, s3, [z, #48] + + ldp a0, a1, [z, #64] + adcs s4, s4, a0 + adcs s5, s5, a1 + stp s4, s5, [z, #64] + + ldp a0, a1, [z, #80] + adcs s6, s6, a0 + adcs s7, s7, a1 + stp s6, s7, [z, #80] + + ldp a0, a1, [z, #96] + adcs a0, a0, c + adcs a1, a1, xzr + stp a0, a1, [z, #96] + + ldp a0, a1, [z, #112] + adcs a0, a0, xzr + adc a1, a1, xzr + stp a0, a1, [z, #112] + +// Restore regs and return + + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/Makefile b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/Makefile new file mode 100644 index 00000000000..c81b9239471 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/Makefile @@ -0,0 +1,103 @@ +############################################################################# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 +############################################################################# + +# If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise +# use a cross-assembling version so that the code can still be assembled +# and the proofs checked against the object files (though you won't be able +# to run code without additional emulation infrastructure). 
The aarch64 +# cross-assembling version can be installed manually by something like: +# +# sudo apt-get install binutils-aarch64-linux-gnu + +UNAME_RESULT=$(shell uname -p) + +ifeq ($(UNAME_RESULT),aarch64) +GAS=as +else +GAS=aarch64-linux-gnu-as +endif + +# List of object files + +OBJ = bignum_add.o \ + bignum_amontifier.o \ + bignum_amontmul.o \ + bignum_amontredc.o \ + bignum_amontsqr.o \ + bignum_bitfield.o \ + bignum_bitsize.o \ + bignum_cdiv.o \ + bignum_cdiv_exact.o \ + bignum_cld.o \ + bignum_clz.o \ + bignum_cmadd.o \ + bignum_cmnegadd.o \ + bignum_cmod.o \ + bignum_cmul.o \ + bignum_coprime.o \ + bignum_copy.o \ + bignum_copy_row_from_table.o \ + bignum_copy_row_from_table_8n.o \ + bignum_copy_row_from_table_16.o \ + bignum_copy_row_from_table_32.o \ + bignum_ctd.o \ + bignum_ctz.o \ + bignum_demont.o \ + bignum_digit.o \ + bignum_digitsize.o \ + bignum_divmod10.o \ + bignum_emontredc.o \ + bignum_eq.o \ + bignum_even.o \ + bignum_ge.o \ + bignum_gt.o \ + bignum_iszero.o \ + bignum_le.o \ + bignum_lt.o \ + bignum_madd.o \ + bignum_modadd.o \ + bignum_moddouble.o \ + bignum_modexp.o \ + bignum_modifier.o \ + bignum_modinv.o \ + bignum_modoptneg.o \ + bignum_modsub.o \ + bignum_montifier.o \ + bignum_montmul.o \ + bignum_montredc.o \ + bignum_montsqr.o \ + bignum_mul.o \ + bignum_muladd10.o \ + bignum_mux.o \ + bignum_mux16.o \ + bignum_negmodinv.o \ + bignum_nonzero.o \ + bignum_normalize.o \ + bignum_odd.o \ + bignum_of_word.o \ + bignum_optadd.o \ + bignum_optneg.o \ + bignum_optsub.o \ + bignum_optsubadd.o \ + bignum_pow2.o \ + bignum_shl_small.o \ + bignum_shr_small.o \ + bignum_sqr.o \ + bignum_sub.o \ + word_bytereverse.o \ + word_clz.o \ + word_ctz.o \ + word_divstep59.o \ + word_max.o \ + word_min.o \ + word_negmodinv.o \ + word_popcount.o \ + word_recip.o + +%.o : %.S ; $(CC) -E -I../../include $< | $(GAS) -o $@ - + +default: $(OBJ); + +clean:; rm -f *.o *.correct diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_add.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_add.S new file mode 100644 index 00000000000..05d3487ddee --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_add.S @@ -0,0 +1,121 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Add, z := x + y +// Inputs x[m], y[n]; outputs function return (carry-out) and z[p] +// +// extern uint64_t bignum_add +// (uint64_t p, uint64_t *z, +// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Does the z := x + y operation, truncating modulo p words in general and +// returning a top carry (0 or 1) in the p'th place, only adding the input +// words below p (as well as m and n respectively) to get the sum and carry. +// +// Standard ARM ABI: X0 = p, X1 = z, X2 = m, X3 = x, X4 = n, X5 = y, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add) + .text + .balign 4 + +#define p x0 +#define z x1 +#define m x2 +#define x x3 +#define n x4 +#define y x5 +#define i x6 +#define a x7 +#define d x8 + + +S2N_BN_SYMBOL(bignum_add): + +// First clamp the two input sizes m := min(p,m) and n := min(p,n) since +// we'll never need words past the p'th. Can now assume m <= p and n <= p. 
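Reviewer note: the behaviour specified for bignum_add above fits in a few lines of C — clamp both input lengths to p, add with carry across the p-word destination, let any leftover carry settle in the first word past the shorter inputs when p is larger, and return the top carry only when the result is truncated at the p'th word. A behavioural model (not the constant-control-flow assembly; the name is invented):

```c
#include <stdint.h>

// Behavioural model of the z := x + y operation truncated to p words.
static uint64_t bignum_add_model(uint64_t p, uint64_t *z,
                                 uint64_t m, const uint64_t *x,
                                 uint64_t n, const uint64_t *y) {
  if (m > p) m = p;                    // words at index >= p are never read
  if (n > p) n = p;
  unsigned __int128 c = 0;
  for (uint64_t i = 0; i < p; i++) {
    c += (i < m) ? x[i] : 0;
    c += (i < n) ? y[i] : 0;
    z[i] = (uint64_t)c;                // a leftover carry lands here when p is larger
    c >>= 64;
  }
  return (uint64_t)c;                  // 0 or 1
}
```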
+// Then compare the modified m and n and branch accordingly + + cmp m, p + csel m, p, m, cs + cmp n, p + csel n, p, n, cs + cmp m, n + bcc bignum_add_ylonger + +// The case where x is longer or of the same size (p >= m >= n) + + sub p, p, m + sub m, m, n + ands i, xzr, xzr + cbz n, bignum_add_xmainskip +bignum_add_xmainloop: + ldr a, [x, i, lsl #3] + ldr d, [y, i, lsl #3] + adcs a, a, d + str a, [z, i, lsl #3] + add i, i, #1 + sub n, n, #1 + cbnz n, bignum_add_xmainloop +bignum_add_xmainskip: + cbz m, bignum_add_xtopskip +bignum_add_xtoploop: + ldr a, [x, i, lsl #3] + adcs a, a, xzr + str a, [z, i, lsl #3] + add i, i, #1 + sub m, m, #1 + cbnz m, bignum_add_xtoploop +bignum_add_xtopskip: + cbnz p, bignum_add_tails + cset x0, cs + ret + +// The case where y is longer (p >= n > m) + +bignum_add_ylonger: + sub p, p, n + sub n, n, m + ands i, xzr, xzr + cbz m, bignum_add_ytoploop +bignum_add_ymainloop: + ldr a, [x, i, lsl #3] + ldr d, [y, i, lsl #3] + adcs a, a, d + str a, [z, i, lsl #3] + add i, i, #1 + sub m, m, #1 + cbnz m, bignum_add_ymainloop +bignum_add_ytoploop: + ldr a, [y, i, lsl #3] + adcs a, xzr, a + str a, [z, i, lsl #3] + add i, i, #1 + sub n, n, #1 + cbnz n, bignum_add_ytoploop +bignum_add_ytopskip: + cbnz p, bignum_add_tails + cset x0, cs + ret + +// Adding a non-trivial tail, when p > max(m,n) + +bignum_add_tails: + cset a, cs + str a, [z, i, lsl #3] + b bignum_add_tail +bignum_add_tailloop: + str xzr, [z, i, lsl #3] +bignum_add_tail: + add i, i, #1 + sub p, p, #1 + cbnz p, bignum_add_tailloop + mov x0, xzr + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_amontifier.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_amontifier.S new file mode 100644 index 00000000000..c5b1a9d9d52 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_amontifier.S @@ -0,0 +1,386 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Compute "amontification" constant z :== 2^{128k} (congruent mod m) +// Input m[k]; output z[k]; temporary buffer t[>=k] +// +// extern void bignum_amontifier +// (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t); +// +// This is called "amontifier" because any other value x can now be mapped into +// the almost-Montgomery domain with an almost-Montgomery multiplication by z. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = m, X3 = t +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_amontifier) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_amontifier) + .text + .balign 4 + +#define k x0 +#define z x1 +#define m x2 +#define t x3 + +// Some variables + +#define i x4 +#define j x5 +#define h x6 +#define a x7 +#define l x8 +#define c x9 +#define b x10 +#define d x11 + +// Some aliases for the values b and d + +#define r x10 +#define q x11 + + +S2N_BN_SYMBOL(bignum_amontifier): + +// If k = 0 the whole operation is trivial + + cbz k, bignum_amontifier_end + +// Copy the input m into the temporary buffer t. The temporary register +// c matters since we want it to hold the highest digit, ready for the +// normalization phase. 
+ + mov i, xzr +bignum_amontifier_copyinloop: + ldr c, [m, i, lsl #3] + str c, [t, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_amontifier_copyinloop + +// Do a rather stupid but constant-time digit normalization, conditionally +// shifting left (k-1) times based on whether the top word is zero. +// With careful binary striding this could be O(k*log(k)) instead of O(k^2) +// while still retaining the constant-time style. +// The "cmp c, xzr" sets the zeroness predicate (ZF) for the entire inner loop + + subs i, k, #1 + beq bignum_amontifier_normalized +bignum_amontifier_normloop: + mov j, xzr + cmp c, xzr + mov a, xzr +bignum_amontifier_shufloop: + mov c, a + ldr a, [t, j, lsl #3] + csel c, c, a, eq + str c, [t, j, lsl #3] + add j, j, #1 + sub d, j, k + cbnz d, bignum_amontifier_shufloop + subs i, i, #1 + bne bignum_amontifier_normloop + +// We now have the top digit nonzero, assuming the input was nonzero, +// and as per the invariant of the loop above, c holds that digit. So +// now just count c's leading zeros and shift t bitwise that many bits. + +bignum_amontifier_normalized: + clz c, c + + mov b, xzr + mov i, xzr + ands xzr, c, #63 + csetm l, ne + neg d, c +bignum_amontifier_bitloop: + ldr j, [t, i, lsl #3] + lsl a, j, c + orr a, a, b + lsr b, j, d + and b, b, l + str a, [t, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_amontifier_bitloop + +// Let h be the high word of n, which in all the in-scope cases is >= 2^63. +// Now successively form q = 2^i div h and r = 2^i mod h as i goes from +// 64 to 126. We avoid just using division out of constant-time concerns +// (at the least we would need to fix up h = 0 for out-of-scope inputs) and +// don't bother with Newton-Raphson, since this stupid simple loop doesn't +// contribute much of the overall runtime at typical sizes. + + sub h, k, #1 + ldr h, [t, h, lsl #3] + mov q, #1 + neg r, h + mov i, #62 +bignum_amontifier_estloop: + add q, q, q + mov a, h + sub a, a, r + cmp r, a // CF <=> r >= h - r <=> 2 * r >= h + csetm a, cs + sub q, q, a + add r, r, r + and a, a, h + sub r, r, a + subs i, i, #1 + bne bignum_amontifier_estloop + +// Strictly speaking the above loop doesn't quite give the true remainder +// and quotient in the special case r = h = 2^63, so fix it up. We get +// q = 2^63 - 1 and r = 2^63 and really want q = 2^63 and r = 0. This is +// supererogatory, because the main property of q used below still holds +// in this case unless the initial m = 1, and then anyway the overall +// specification (congruence modulo m) holds degenerately. But it seems +// nicer to get a "true" quotient and remainder. + + cmp r, h + csinc q, q, q, ne + +// So now we have q and r with 2^126 = q * h + r (imagining r = 0 in the +// fixed-up case above: note that we never actually use the computed +// value of r below and so didn't adjust it). And we can assume the ranges +// q <= 2^63 and r < h < 2^64. +// +// The idea is to use q as a first quotient estimate for a remainder +// of 2^{p+62} mod n, where p = 64 * k. We have, splitting n into the +// high and low parts h and l: +// +// 2^{p+62} - q * n = 2^{p+62} - q * (2^{p-64} * h + l) +// = 2^{p+62} - (2^{p-64} * (q * h) + q * l) +// = 2^{p+62} - 2^{p-64} * (2^126 - r) - q * l +// = 2^{p-64} * r - q * l +// +// Note that 2^{p-64} * r < 2^{p-64} * h <= n +// and also q * l < 2^63 * 2^{p-64} = 2^{p-1} <= n +// so |diff| = |2^{p-64} * r - q * l| < n. +// +// If in fact diff >= 0 then it is already 2^{p+62} mod n. +// otherwise diff + n is the right answer. +// +// To (maybe?) 
make the computation slightly easier we actually flip +// the sign and compute d = q * n - 2^{p+62}. Then the answer is either +// -d (when negative) or n - d; in either case we effectively negate d. +// This negating tweak in fact spoils the result for cases where +// 2^{p+62} mod n = 0, when we get n instead. However the only case +// where this can happen is m = 1, when the whole spec holds trivially, +// and actually the remainder of the logic below works anyway since +// the latter part of the code only needs a congruence for the k-digit +// result, not strict modular reduction (the doublings will maintain +// the non-strict inequality). + + mov c, xzr + adds i, xzr, xzr +bignum_amontifier_mulloop: + ldr a, [t, i, lsl #3] + mul l, q, a + adcs l, l, c + umulh c, q, a + str l, [z, i, lsl #3] + add i, i, #1 + sub a, i, k + cbnz a, bignum_amontifier_mulloop + + adc c, c, xzr + mov a, #0x4000000000000000 + subs c, c, a + csetm q, cs + +// Now do [c] * n - d for our final answer + + subs i, xzr, xzr +bignum_amontifier_remloop: + ldr a, [t, i, lsl #3] + ldr b, [z, i, lsl #3] + and a, a, q + sbcs a, a, b + str a, [z, i, lsl #3] + add i, i, #1 + sub a, i, k + cbnz a, bignum_amontifier_remloop + +// Now still need to do a couple of modular doublings to get us all the +// way up to 2^{p+64} == r from the initial 2^{p+62} == r (mod n). + + mov c, xzr + subs j, xzr, xzr +bignum_amontifier_dubloop1: + ldr a, [z, j, lsl #3] + extr c, a, c, #63 + ldr b, [t, j, lsl #3] + sbcs c, c, b + str c, [z, j, lsl #3] + mov c, a + add j, j, #1 + sub a, j, k + cbnz a, bignum_amontifier_dubloop1 + lsr c, c, #63 + sbc c, c, xzr + adds j, xzr, xzr +bignum_amontifier_corrloop1: + ldr a, [z, j, lsl #3] + ldr b, [t, j, lsl #3] + and b, b, c + adcs a, a, b + str a, [z, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_amontifier_corrloop1 + +// This is not exactly the same: we also copy output to t giving the +// initialization t_1 = r == 2^{p+64} mod n for the main loop next. + + mov c, xzr + subs j, xzr, xzr +bignum_amontifier_dubloop2: + ldr a, [z, j, lsl #3] + extr c, a, c, #63 + ldr b, [t, j, lsl #3] + sbcs c, c, b + str c, [z, j, lsl #3] + mov c, a + add j, j, #1 + sub a, j, k + cbnz a, bignum_amontifier_dubloop2 + lsr c, c, #63 + sbc c, c, xzr + adds j, xzr, xzr +bignum_amontifier_corrloop2: + ldr a, [z, j, lsl #3] + ldr b, [t, j, lsl #3] + and b, b, c + adcs a, a, b + str a, [z, j, lsl #3] + str a, [t, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_amontifier_corrloop2 + +// We then successively generate (k+1)-digit values satisfying +// t_i == 2^{p+64*i} mod n, each of which is stored in h::t. Finish +// initialization by zeroing h initially + + mov h, xzr + +// Then if t_i = 2^{p} * h + l +// we have t_{i+1} == 2^64 * t_i +// = (2^{p+64} * h) + (2^64 * l) +// == r * h + l<<64 +// Do this k more times so we end up == 2^{128*k+64}, one more than we want +// +// Writing B = 2^{64k}, the possible correction of adding r, which for +// a (k+1)-digit result is equivalent to subtracting q = 2^{64*(k+1)} - r +// would give the overall worst-case value minus q of +// [ B * (B^k - 1) + (B - 1) * r ] - [B^{k+1} - r] +// = B * (r - 1) < B^{k+1} so we keep inside k+1 digits as required. +// +// This implementation makes the shift implicit by starting b with the +// "previous" digit (initially 0) to offset things by 1. 
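Reviewer note on the quotient estimate used earlier in this routine: starting from 2^64 = 1*h + (2^64 - h) for a normalized h >= 2^63, the 62-step loop doubles q and r and subtracts h from r whenever 2r >= h, ending with q = floor(2^126 / h) and r = 2^126 mod h (up to the r = h = 2^63 corner case handled by the csinc fix-up after the loop). A direct C model of that loop (illustrative only):

```c
#include <stdint.h>

// Shift-and-subtract estimation of q = floor(2^126 / h), r = 2^126 mod h,
// for a normalized word h >= 2^63.
static void estimate_quotient(uint64_t h, uint64_t *q_out, uint64_t *r_out) {
  uint64_t q = 1;
  uint64_t r = (uint64_t)0 - h;        // 2^64 - h, i.e. 2^64 mod h for h > 2^63
  for (int i = 0; i < 62; i++) {
    int ge = r >= h - r;               // does 2*r reach h? (compare before doubling)
    q = 2 * q + (uint64_t)ge;
    r = 2 * r;                         // may wrap; the subtraction below fixes it up
    if (ge) r -= h;
  }
  *q_out = q;
  *r_out = r;
}
```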
+ + mov i, k +bignum_amontifier_modloop: + mov j, xzr + mov b, xzr + adds c, xzr, xzr +bignum_amontifier_cmaloop: + ldr a, [z, j, lsl #3] + mul l, h, a + adcs b, b, c + umulh c, h, a + adc c, c, xzr + adds l, b, l + ldr b, [t, j, lsl #3] + str l, [t, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_amontifier_cmaloop + + adcs h, b, c + + csetm l, cs + + adds j, xzr, xzr +bignum_amontifier_oaloop: + ldr a, [t, j, lsl #3] + ldr b, [z, j, lsl #3] + and b, b, l + adcs a, a, b + str a, [t, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_amontifier_oaloop + adc h, h, xzr + + subs i, i, #1 + bne bignum_amontifier_modloop + +// Now do one almost-Montgomery reduction w.r.t. the original m +// which lops off one 2^64 from the congruence and, with the usual +// almost-Montgomery correction, gets us back inside k digits for +// the end result. + + ldr a, [m] + lsl d, a, #2 + sub d, a, d + eor d, d, #2 + mov l, #1 + madd c, a, d, l + mul b, c, c + madd d, c, d, d + mul c, b, b + madd d, b, d, d + mul b, c, c + madd d, c, d, d + madd d, b, d, d + + ldr b, [t] + mul d, b, d + + mul l, d, a + umulh c, d, a + mov j, #1 + sub a, k, #1 + adds xzr, b, l + cbz a, bignum_amontifier_montend + +bignum_amontifier_montloop: + ldr a, [m, j, lsl #3] + ldr b, [t, j, lsl #3] + mul l, d, a + adcs b, b, c + umulh c, d, a + adc c, c, xzr + adds b, b, l + sub a, j, #1 + str b, [t, a, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_amontifier_montloop +bignum_amontifier_montend: + adcs h, h, c + csetm l, cs + sub a, k, #1 + str h, [t, a, lsl #3] + + subs j, xzr, xzr +bignum_amontifier_osloop: + ldr a, [t, j, lsl #3] + ldr b, [m, j, lsl #3] + and b, b, l + sbcs a, a, b + str a, [z, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_amontifier_osloop + +bignum_amontifier_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_amontmul.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_amontmul.S new file mode 100644 index 00000000000..79e4a5f03ef --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_amontmul.S @@ -0,0 +1,180 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Almost-Montgomery multiply, z :== (x * y / 2^{64k}) (congruent mod m) +// Inputs x[k], y[k], m[k]; output z[k] +// +// extern void bignum_amontmul +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); +// +// Does z :== (x * y / 2^{64k}) mod m, meaning that the result, in the native +// size k, is congruent modulo m, but might not be fully reduced mod m. This +// is why it is called *almost* Montgomery multiplication. 
+// +// Standard ARM ABI: X0 = k, X1 = z, X2 = x, X3 = y, X4 = m +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_amontmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_amontmul) + .text + .balign 4 + +#define k x0 +#define z x1 +#define x x2 +#define y x3 +#define m x4 + +// Negated modular inverse +#define w x5 +// Top carry for k'th position +#define c0 x6 +// Additional top carry for (k+1)'th position +#define c1 x7 +// Outer loop counter +#define i x8 +// Home for i'th digit or Montgomery multiplier +#define d x9 +// Inner loop counter +#define j x10 +#define h x11 +#define e x12 +#define l x13 +#define a x14 + +// This is just a short-term temporary used in zero-test subtraction. +// It's aliased to the same register as "a" which is always safe here. + +#define t x14 + +// Some more intuitive names for temp regs in initial word-level negmodinv. +// These just use c0 and c1 again, which aren't initialized early on. + +#define one x6 +#define e1 x6 +#define e2 x7 +#define e4 x6 +#define e8 x7 + + +S2N_BN_SYMBOL(bignum_amontmul): + +// If k = 0 the whole operation is trivial + + cbz k, bignum_amontmul_end + +// Compute word-level negated modular inverse w for m == m[0]. +// This is essentially the same as word_negmodinv. + + ldr a, [m] + lsl w, a, #2 + sub w, a, w + eor w, w, #2 + mov one, #1 + madd e1, a, w, one + mul e2, e1, e1 + madd w, e1, w, w + mul e4, e2, e2 + madd w, e2, w, w + mul e8, e4, e4 + madd w, e4, w, w + madd w, e8, w, w + +// Initialize the output c0::z to zero so we can then consistently add rows. +// It would be a bit more efficient to special-case the zeroth row, but +// this keeps the code slightly simpler. + + mov i, xzr +bignum_amontmul_zoop: + str xzr, [z, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_amontmul_zoop + mov c0, xzr + +// Outer loop pulling down digits d=x[i], multiplying by y and reducing + + mov i, xzr +bignum_amontmul_outerloop: + +// Multiply-add loop where we always have CF + previous high part h to add in +// Note that in general we do need yet one more carry in this phase and hence +// initialize c1 with the top carry. + + ldr d, [x, i, lsl #3] + mov j, xzr + adds h, xzr, xzr +bignum_amontmul_maddloop: + ldr a, [y, j, lsl #3] + ldr e, [z, j, lsl #3] + mul l, d, a + adcs e, e, h + umulh h, d, a + adc h, h, xzr + adds e, e, l + str e, [z, j, lsl #3] + add j, j, #1 + sub t, j, k + cbnz t, bignum_amontmul_maddloop + adcs c0, c0, h + adc c1, xzr, xzr + +// Montgomery reduction loop, similar but offsetting writebacks + + ldr e, [z] + mul d, e, w + ldr a, [m] + mul l, d, a + umulh h, d, a + adds e, e, l // Will be zero but want the carry + mov j, #1 + sub t, k, #1 + cbz t, bignum_amontmul_montend +bignum_amontmul_montloop: + ldr a, [m, j, lsl #3] + ldr e, [z, j, lsl #3] + mul l, d, a + adcs e, e, h + umulh h, d, a + adc h, h, xzr + adds e, e, l + sub l, j, #1 + str e, [z, l, lsl #3] + add j, j, #1 + sub t, j, k + cbnz t, bignum_amontmul_montloop +bignum_amontmul_montend: + adcs h, c0, h + adc c0, c1, xzr + sub l, j, #1 + str h, [z, l, lsl #3] + +// End of outer loop + + add i, i, #1 + cmp i, k + bcc bignum_amontmul_outerloop + +// Now convert carry word, which is always in {0,1}, into a mask +// and do a masked subtraction of m for the final almost-Montgomery result. 
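+
+// For reference (illustrative only, not part of the upstream source), the
+// word_negmodinv-style sequence near the top of this function corresponds to
+// the C sketch below (hypothetical helper name). Each step squares the error
+// e = m*w + 1 modulo 2^64, and the initial value is already a negated inverse
+// modulo 2^4, so four steps give w * m + 1 == 0 (mod 2^64) for any odd m:
+//
+//   static inline uint64_t word_negmodinv_sketch(uint64_t m) {
+//     uint64_t w = (m - (m << 2)) ^ 2;  // negated inverse mod 2^4 (at least)
+//     uint64_t e = m * w + 1;           // error term
+//     w += e * w; e *= e;               // error is now e^2
+//     w += e * w; e *= e;               // error is now e^4
+//     w += e * w; e *= e;               // error is now e^8
+//     w += e * w;                       // error is now e^16 == 0 (mod 2^64)
+//     return w;
+//   }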
+ + neg c0, c0 + subs j, xzr, xzr +bignum_amontmul_corrloop: + ldr a, [z, j, lsl #3] + ldr e, [m, j, lsl #3] + and e, e, c0 + sbcs a, a, e + str a, [z, j, lsl #3] + add j, j, #1 + sub t, j, k + cbnz t, bignum_amontmul_corrloop + +bignum_amontmul_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_amontredc.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_amontredc.S new file mode 100644 index 00000000000..79fef8e5be5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_amontredc.S @@ -0,0 +1,176 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Almost-Montgomery reduce, z :== (x' / 2^{64p}) (congruent mod m) +// Inputs x[n], m[k], p; output z[k] +// +// extern void bignum_amontredc +// (uint64_t k, uint64_t *z, +// uint64_t n, uint64_t *x, uint64_t *m, uint64_t p); +// +// Does a :== (x' / 2^{64p}) mod m where x' = x if n <= p + k and in general +// is the lowest (p+k) digits of x. That is, p-fold almost-Montgomery reduction +// w.r.t. a k-digit modulus m giving a k-digit answer. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = n, X3 = x, X4 = m, X5 = p +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_amontredc) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_amontredc) + .text + .balign 4 + +#define k x0 +#define z x1 +#define n x2 +#define x x3 +#define m x4 +#define p x5 + +// Negated modular inverse +#define w x6 +// Outer loop counter +#define i x7 +// Inner loop counter +#define j x8 +// Home for Montgomery multiplier +#define d x9 +// Top carry for current window +#define c x14 + +#define h x10 +#define e x11 +#define l x12 +#define a x13 + +// Some more intuitive names for temp regs in initial word-level negmodinv. +// These just use i and j again, which aren't used early on. + +#define one x7 +#define e1 x7 +#define e2 x8 +#define e4 x7 +#define e8 x8 + + +S2N_BN_SYMBOL(bignum_amontredc): + +// If k = 0 the whole operation is trivial + + cbz k, bignum_amontredc_end + +// Compute word-level negated modular inverse w for m == m[0]. +// This is essentially the same as word_negmodinv. + + ldr a, [m] + lsl w, a, #2 + sub w, a, w + eor w, w, #2 + mov one, #1 + madd e1, a, w, one + mul e2, e1, e1 + madd w, e1, w, w + mul e4, e2, e2 + madd w, e2, w, w + mul e8, e4, e4 + madd w, e4, w, w + madd w, e8, w, w + +// Initialize z to the lowest k digits of the input, zero-padding if n < k. 
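+
+// A minimal C sketch of this initialization (illustrative only, not part of
+// the upstream source): copy min(n,k) digits of x, then zero-pad up to k:
+//
+//   uint64_t j = (n < k) ? n : k, i;
+//   for (i = 0; i < j; i++) z[i] = x[i];
+//   for (; i < k; i++) z[i] = 0;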
+ + cmp n, k + csel j, k, n, cs + mov i, xzr + cbz j, bignum_amontredc_padloop +bignum_amontredc_copyloop: + ldr a, [x, i, lsl #3] + str a, [z, i, lsl #3] + add i, i, #1 + cmp i, j + bcc bignum_amontredc_copyloop + + cmp i, k + bcs bignum_amontredc_initialized + +bignum_amontredc_padloop: + str xzr, [z, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_amontredc_padloop + +bignum_amontredc_initialized: + mov c, xzr + +// Now if p = 0 that's the end of the operation + + cbz p, bignum_amontredc_end + +// Outer loop, just doing a standard Montgomery reduction on z + + mov i, xzr +bignum_amontredc_outerloop: + + ldr e, [z] + mul d, e, w + ldr a, [m] + mul l, d, a + umulh h, d, a + adds e, e, l // Will be zero but want the carry + mov j, #1 + sub a, k, #1 + cbz a, bignum_amontredc_montend +bignum_amontredc_montloop: + ldr a, [m, j, lsl #3] + ldr e, [z, j, lsl #3] + mul l, d, a + adcs e, e, h + umulh h, d, a + adc h, h, xzr + adds e, e, l + sub l, j, #1 + str e, [z, l, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_amontredc_montloop +bignum_amontredc_montend: + adcs h, h, c + adc c, xzr, xzr + add j, j, i + cmp j, n + bcs bignum_amontredc_offtheend + ldr a, [x, j, lsl #3] + adds h, h, a + adc c, c, xzr +bignum_amontredc_offtheend: + sub j, k, #1 + str h, [z, j, lsl #3] + +// End of outer loop + + add i, i, #1 + cmp i, p + bcc bignum_amontredc_outerloop + +// Now convert carry word, which is always in {0,1}, into a mask +// and do a masked subtraction of m for the final almost-Montgomery result. + + neg c, c + subs j, xzr, xzr +bignum_amontredc_corrloop: + ldr a, [z, j, lsl #3] + ldr e, [m, j, lsl #3] + and e, e, c + sbcs a, a, e + str a, [z, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_amontredc_corrloop + +bignum_amontredc_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_amontsqr.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_amontsqr.S new file mode 100644 index 00000000000..d927e137d84 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_amontsqr.S @@ -0,0 +1,180 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Almost-Montgomery square, z :== (x^2 / 2^{64k}) (congruent mod m) +// Inputs x[k], m[k]; output z[k] +// +// extern void bignum_amontsqr +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); +// +// Does z :== (x^2 / 2^{64k}) mod m, meaning that the result, in the native +// size k, is congruent modulo m, but might not be fully reduced mod m. This +// is why it is called *almost* Montgomery squaring. 
+// +// Standard ARM ABI: X0 = k, X1 = z, X2 = x, X3 = m +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_amontsqr) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_amontsqr) + .text + .balign 4 + +#define k x0 +#define z x1 +#define x x2 +#define m x3 + +// Negated modular inverse +#define w x4 +// Top carry for k'th position +#define c0 x5 +// Additional top carry for (k+1)'th position +#define c1 x6 +// Outer loop counter +#define i x7 +// Home for i'th digit or Montgomery multiplier +#define d x8 +// Inner loop counter +#define j x9 + +#define h x10 +#define e x11 +#define l x12 +#define a x13 + +// This is just a short-term temporary used in zero-test subtraction. +// It's aliased to the same register as "a" which is always safe here. + +#define t x13 + +// Some more intuitive names for temp regs in initial word-level negmodinv. +// These just use c0 and c1 again, which aren't initialized early on. + +#define one x5 +#define e1 x5 +#define e2 x6 +#define e4 x5 +#define e8 x6 + + +S2N_BN_SYMBOL(bignum_amontsqr): + +// If k = 0 the whole operation is trivial + + cbz k, bignum_amontsqr_end + +// Compute word-level negated modular inverse w for m == m[0]. +// This is essentially the same as word_negmodinv. + + ldr a, [m] + lsl w, a, #2 + sub w, a, w + eor w, w, #2 + mov one, #1 + madd e1, a, w, one + mul e2, e1, e1 + madd w, e1, w, w + mul e4, e2, e2 + madd w, e2, w, w + mul e8, e4, e4 + madd w, e4, w, w + madd w, e8, w, w + +// Initialize the output c0::z to zero so we can then consistently add rows. +// It would be a bit more efficient to special-case the zeroth row, but +// this keeps the code slightly simpler. + + mov i, xzr +bignum_amontsqr_zoop: + str xzr, [z, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_amontsqr_zoop + mov c0, xzr + +// Outer loop pulling down digits d=x[i], multiplying by x and reducing + + mov i, xzr +bignum_amontsqr_outerloop: + +// Multiply-add loop where we always have CF + previous high part h to add in +// Note that in general we do need yet one more carry in this phase and hence +// initialize c1 with the top carry. + + ldr d, [x, i, lsl #3] + mov j, xzr + adds h, xzr, xzr +bignum_amontsqr_maddloop: + ldr a, [x, j, lsl #3] + ldr e, [z, j, lsl #3] + mul l, d, a + adcs e, e, h + umulh h, d, a + adc h, h, xzr + adds e, e, l + str e, [z, j, lsl #3] + add j, j, #1 + sub t, j, k + cbnz t, bignum_amontsqr_maddloop + adcs c0, c0, h + adc c1, xzr, xzr + +// Montgomery reduction loop, similar but offsetting writebacks + + ldr e, [z] + mul d, e, w + ldr a, [m] + mul l, d, a + umulh h, d, a + adds e, e, l // Will be zero but want the carry + mov j, #1 + sub t, k, #1 + cbz t, bignum_amontsqr_montend +bignum_amontsqr_montloop: + ldr a, [m, j, lsl #3] + ldr e, [z, j, lsl #3] + mul l, d, a + adcs e, e, h + umulh h, d, a + adc h, h, xzr + adds e, e, l + sub l, j, #1 + str e, [z, l, lsl #3] + add j, j, #1 + sub t, j, k + cbnz t, bignum_amontsqr_montloop +bignum_amontsqr_montend: + adcs h, c0, h + adc c0, c1, xzr + sub l, j, #1 + str h, [z, l, lsl #3] + +// End of outer loop + + add i, i, #1 + cmp i, k + bcc bignum_amontsqr_outerloop + +// Now convert carry word, which is always in {0,1}, into a mask +// and do a masked subtraction of m for the final almost-Montgomery result. 
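+
+// A minimal C sketch of this masked subtraction (illustrative only, not part
+// of the upstream source): with c0 in {0,1}, the mask 0 - c0 selects either
+// m or 0, so the subtraction happens exactly when the top carry was set,
+// with no data-dependent branch:
+//
+//   uint64_t mask = 0 - c0, borrow = 0;
+//   for (uint64_t j = 0; j < k; j++) {
+//     uint64_t mj = m[j] & mask;
+//     uint64_t zj = z[j];
+//     uint64_t d = zj - borrow;
+//     uint64_t b = (zj < borrow);
+//     z[j] = d - mj;
+//     borrow = b | (d < mj);
+//   }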
+ + neg c0, c0 + subs j, xzr, xzr +bignum_amontsqr_corrloop: + ldr a, [z, j, lsl #3] + ldr e, [m, j, lsl #3] + and e, e, c0 + sbcs a, a, e + str a, [z, j, lsl #3] + add j, j, #1 + sub t, j, k + cbnz t, bignum_amontsqr_corrloop + +bignum_amontsqr_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_bitfield.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_bitfield.S new file mode 100644 index 00000000000..1630f0b0f62 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_bitfield.S @@ -0,0 +1,93 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Select bitfield starting at bit n with length l <= 64 +// Inputs x[k], n, l; output function return +// +// extern uint64_t bignum_bitfield +// (uint64_t k, uint64_t *x, uint64_t n, uint64_t l); +// +// One-word bitfield from a k-digit (digit=64 bits) bignum, in constant-time +// style. Bitfield starts at bit n and has length l, indexing from 0 (=LSB). +// Digits above the top are treated uniformly as zero, as usual. Since the +// result is returned in a single word, effectively we use l' = min(64,l) +// for the length. +// +// Standard ARM ABI: X0 = k, X1 = x, X2 = n, X3 = l, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_bitfield) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_bitfield) + .text + .balign 4 + +#define k x0 +#define x x1 +#define n x2 +#define l x3 + +#define d x4 +#define e x5 +#define i x6 +#define a x7 +#define m x8 + + +S2N_BN_SYMBOL(bignum_bitfield): + +// For length zero finish immediately (the return value in x0 is 0) + + cbz k, bignum_bitfield_end + +// Decompose the index into n = 64 * n + m, then increment n for next part + + and m, n, #63 + lsr n, n, #6 + add n, n, #1 + +// Run over the digits setting d = n'th and e = (n+1)'th + + mov i, xzr + mov e, xzr +bignum_bitfield_loop: + ldr a, [x, i, lsl #3] + cmp i, n + csel d, a, d, cc + csel e, a, e, eq + add i, i, #1 + cmp i, k + bcc bignum_bitfield_loop + +// Override d with 0 if we ran off the end (e will retain original 0). + + cmp i, n + csel d, xzr, d, cc + +// Override e if we have m = 0 (i.e. original n was divisible by 64) +// This is because then we want to shift it right by 64 below. + + cmp m, xzr + csel e, xzr, e, eq + +// Combine shifted digits to get the bitfield(n,64) + + lsr d, d, m + neg m, m + lsl e, e, m + orr a, d, e + +// Now mask it down to get bitfield (n,l) + + cmp l, #64 + cset m, cc + lsl m, m, l + sub m, m, #1 + and x0, a, m +bignum_bitfield_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_bitsize.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_bitsize.S new file mode 100644 index 00000000000..e1a4a6dafde --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_bitsize.S @@ -0,0 +1,67 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Return size of bignum in bits +// Input x[k]; output function return +// +// extern uint64_t bignum_bitsize (uint64_t k, uint64_t *x); +// +// In the case of a zero bignum as input the result is 0 +// +// In principle this has a precondition k < 2^58, but obviously that +// is always true in practice because of address space limitations. +// +// Standard ARM ABI: X0 = k, X1 = x, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_bitsize) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_bitsize) + .text + .balign 4 + +#define k x0 +#define x x1 +#define i x2 +#define w x3 +#define a x4 +#define j x5 + + +S2N_BN_SYMBOL(bignum_bitsize): + +// If the bignum is zero-length, x0 is already the right answer of 0 + + cbz k, bignum_bitsize_end + +// Use w = a[i-1] to store nonzero words in a bottom-up sweep +// Set the initial default to be as if we had a 11...11 word directly below + + mov i, xzr + mov w, #-1 + mov j, xzr +bignum_bitsize_loop: + ldr a, [x, j, lsl #3] + add j, j, #1 + cmp a, #0 + csel i, j, i, ne + csel w, a, w, ne + cmp j, k + bne bignum_bitsize_loop + +// Now w = a[i-1] is the highest nonzero word, or in the zero case the +// default of the "extra" 11...11 = a[0-1]. We now want 64* i - clz(w). +// Note that this code does not rely on the behavior of the clz instruction +// for zero inputs, though the ARM manual does in fact guarantee clz(0) = 64. + + lsl i, i, #6 + clz a, w + sub x0, i, a + +bignum_bitsize_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cdiv.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cdiv.S new file mode 100644 index 00000000000..7e6480c2a03 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cdiv.S @@ -0,0 +1,278 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Divide by a single (nonzero) word, z := x / m and return x mod m +// Inputs x[n], m; outputs function return (remainder) and z[k] +// +// extern uint64_t bignum_cdiv +// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t m); +// +// Does the "z := x / m" operation where x is n digits, result z is k. +// Truncates the quotient in general, but always (for nonzero m) returns +// the true remainder x mod m. 
+// +// Standard ARM ABI: X0 = k, X1 = z, X2 = n, X3 = x, X4 = m, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cdiv) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cdiv) + .text + .balign 4 + +#define k x0 +#define z x1 +#define n x2 +#define x x3 +#define m x4 + +// Main variables + +#define w x5 +#define i x6 +#define a x7 +#define c x8 +#define d x9 +#define e x10 +#define f x11 +#define l x12 + +// These two are the same + +#define h x13 +#define q x13 + +// Variables for the negmodinv + +#define one x6 +#define e1 x6 +#define e2 x7 +#define e4 x6 +#define e8 x7 + +// Variable to hold the remainder + +#define r x14 + +S2N_BN_SYMBOL(bignum_cdiv): + +// Effectively the same dataflow as bignum_cmod, with some basic +// variable changes (using n for the size not k, returning r, etc.) +// and using the i counter instead of modifying the size as a loop +// counter. + + mov r, xzr + cbz n, bignum_cdiv_nomodulus + + clz e, m + lsl f, m, e + + lsr a, f, #16 + eor w, a, #0x1ffffffffffff + add a, a, #0x1 + lsr w, w, #32 + mneg r, a, w + lsr d, r, #49 + mul d, d, d + lsr r, r, #34 + add r, d, r + orr d, d, #0x40000000 + mul d, r, d + lsr d, d, #30 + lsl r, w, #30 + madd w, w, d, r + lsr w, w, #30 + mneg r, a, w + lsr r, r, #24 + mul r, r, w + lsl w, w, #16 + lsr r, r, #24 + add w, w, r + mneg r, a, w + lsr r, r, #32 + mul r, r, w + lsl w, w, #31 + lsr r, r, #17 + add w, w, r + mul d, f, w + umulh r, f, w + extr d, r, d, #60 + lsr r, w, #33 + mvn d, d + mul d, r, d + lsl w, w, #1 + lsr d, d, #33 + add w, w, d + adds d, w, #0x1 + cinv d, d, eq + umulh r, f, d + adds xzr, r, f + csel w, w, d, cs + + mneg r, w, f + + mov h, xzr + mov l, xzr + mov i, n +bignum_cdiv_modloop: + sub i, i, #1 + ldr d, [x, i, lsl #3] + mul a, r, h + umulh h, r, h + adds a, a, d + adcs h, h, l + csel l, r, xzr, cs + adds l, l, a + adc h, h, xzr + cbnz i, bignum_cdiv_modloop + + umulh c, w, h + adds c, c, h + csel r, f, xzr, cs + + mul a, c, f + umulh d, c, f + add d, d, r + subs l, l, a + sbcs h, h, d + + csel a, f, xzr, ne + subs l, l, a + sbcs h, h, xzr + + csel a, f, xzr, ne + sub l, l, a + + umulh c, w, l + adds c, c, l + cset r, cs + extr c, r, c, #1 + + eor e, e, #63 + lsr c, c, e + + mul a, c, m + sub l, l, a + + subs r, l, m + csel r, r, l, cs + +bignum_cdiv_nomodulus: + +// If k = 0 then there's no more to be done + + cbz k, bignum_cdiv_end + +// Let e be the number of trailing zeros in m. This implementation uses +// 63 - clz(-m & m) which is a bit slicker than the main word_ctz function +// but fails for m = 0. We don't have to worry about that case here. + + neg e, m + and e, e, m + clz e, e + eor e, e, #63 + +// Also generate a corresponding bitmask f for selecting bottom 64 - e bits. + + mov f, #-1 + lsr f, f, e + +// Now just shift m right by e bits. So hereafter we can assume m is odd +// but we first need to shift the input right by e bits then divide by m. + + lsr m, m, e + +// Compute the negated modular inverse w with w * m + 1 == 0 (mod 2^64) +// This is essentially the same as word_negmodinv. + + sub w, m, m, lsl #2 + eor w, w, #2 + mov one, #1 + madd e1, m, w, one + mul e2, e1, e1 + madd w, e1, w, w + mul e4, e2, e2 + madd w, e2, w, w + mul e8, e4, e4 + madd w, e4, w, w + madd w, e8, w, w + +// We have the remainder r, so now x = m * y + r for some quotient y +// to be computed. Consider x' = x + (m - r) = m * (y + 1) and do a +// Montgomery reduction, keeping the cofactor z. 
This gives us +// x' + m * z = 2^{64k} * c where c <= m. Thus since x' = m * (y + 1) +// we have +// +// m * (y + z + 1) = 2^{64k} * c +// +// This means m * (y + z + 1) == 0 (mod 2^{64k}), even when we truncate +// x to k digits (if in fact k < n). Since m is odd, it's coprime to +// 2^{64k} so we can cancel and get y + z + 1 == 0 (mod 2^{64k}), and +// hence using logical complement y == ~z (mod 2^{64k}). Thus we can +// write back the logical complements of the cofactor as the answer. +// Start with carry word c = m - r/2^e to make the initial tweak +// x' = x + (m - r); since we've shifted everything initially by e +// we need to shift the remainder too before subtracting from the +// shifted m. + + lsr c, r, e + sub c, m, c + mov i, xzr + +// Unless n = 0, preload the zeroth digit shifted right e places and bump +// up the x pointer by 8 and n down by 1, to ease indexing and comparison +// using the same variable i in the main loop. When n = 0 we leave it alone, +// as the comparison i < n will always fail and the x pointer is unused. + + mov d, xzr + cbz n, bignum_cdiv_loop + ldr d, [x], #8 + lsr d, d, e + sub n, n, 1 + +bignum_cdiv_loop: + +// Load the next digit up to get [l,d] then shift right e places, +// eventually setting d back to the other part of the newly loaded digit +// ready for the next time round the loop. + + mov l, xzr + cmp i, n + bcs bignum_cdiv_noload + ldr l, [x, i, lsl #3] +bignum_cdiv_noload: + rorv l, l, e + bic a, l, f + orr a, d, a + and d, l, f + +// Now a is the next digit after shifting right by e places, c the carry-in. +// Do the main Montgomery step with the (odd) m, writing back ~q. + + adds a, a, c + mul q, a, w + cset c, cs + mvn l, q + str l, [z, i, lsl #3] + + mul l, q, m + umulh h, q, m + + adds l, l, a + adc c, h, c + + add i, i, #1 + cmp i, k + bcc bignum_cdiv_loop + +// And return the remainder + +bignum_cdiv_end: + + mov x0, r + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cdiv_exact.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cdiv_exact.S new file mode 100644 index 00000000000..cdf0eff0ff4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cdiv_exact.S @@ -0,0 +1,162 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Divide by a single word, z := x / m *when known to be exact* +// Inputs x[n], m; output z[k] +// +// extern void bignum_cdiv_exact +// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t m); +// +// Does the "z := x / m" operation where x is n digits and result z is k, +// *assuming* that m is nonzero and that the input x is in fact an +// exact multiple of m. (If this isn't known, use the general bignum_cdiv +// function instead.) In general the result is truncated to k digits. 
+// +// Standard ARM ABI: X0 = k, X1 = z, X2 = n, X3 = x, X4 = m +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cdiv_exact) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cdiv_exact) + .text + .balign 4 + +#define k x0 +#define z x1 +#define n x2 +#define x x3 +#define m x4 + +// Main variables + +#define w x5 +#define i x6 +#define a x7 +#define c x8 +#define d x9 +#define e x10 +#define f x11 +#define l x12 + +// These two are the same + +#define h x13 +#define q x13 + +// Variables for the negmodinv + +#define one x6 +#define e1 x6 +#define e2 x7 +#define e4 x6 +#define e8 x7 + +S2N_BN_SYMBOL(bignum_cdiv_exact): + +// If k = 0 then there's nothing to be done + + cbz k, bignum_cdiv_exact_end + +// Let e be the number of trailing zeros in m. This implementation uses +// 63 - clz(-m & m) which is a bit slicker than the main word_ctz function +// but fails for m = 0. We don't have to worry about that case here. + + neg e, m + and e, e, m + clz e, e + eor e, e, #63 + +// Also generate a corresponding bitmask f for selecting bottom 64 - e bits. + + mov f, #-1 + lsr f, f, e + +// Now just shift m right by e bits. So hereafter we can assume m is odd +// but we first need to shift the input right by e bits then divide by m. + + lsr m, m, e + +// Compute the negated modular inverse w with w * m + 1 == 0 (mod 2^64) +// This is essentially the same as word_negmodinv. + + sub w, m, m, lsl #2 + eor w, w, #2 + mov one, #1 + madd e1, m, w, one + mul e2, e1, e1 + madd w, e1, w, w + mul e4, e2, e2 + madd w, e2, w, w + mul e8, e4, e4 + madd w, e4, w, w + madd w, e8, w, w + +// Consider x' = x + m and do a Montgomery reduction, keeping the cofactor z. +// This gives us x' + m * z = 2^{64k} * c where c <= m. Assuming x = m * y +// we then have m * y + m + m * z = 2^{64k} * c, i.e. +// +// m * (y + z + 1) = 2^{64k} * c +// +// This means m * (y + z + 1) == 0 (mod 2^{64k}), even when we truncate +// x to k digits (if in fact k < n). Since m is odd, it's coprime to +// 2^{64k} so we can cancel and get y + z + 1 == 0 (mod 2^{64k}), and +// hence using logical complement y == ~z (mod 2^{64k}). Thus we can +// write back the logical complements of the cofactor as the answer. +// Start with carry word c = m to make the initial tweak x' = x + m. + + mov c, m + mov i, xzr + +// Unless n = 0, preload the zeroth digit shifted right e places and bump +// up the x pointer by 8 and n down by 1, to ease indexing and comparison +// using the same variable i in the main loop. When n = 0 we leave it alone, +// as the comparison i < n will always fail and the x pointer is unused. + + mov d, xzr + cbz n, bignum_cdiv_exact_loop + ldr d, [x], #8 + lsr d, d, e + sub n, n, 1 + +bignum_cdiv_exact_loop: + +// Load the next digit up to get [l,d] then shift right e places, +// eventually setting d back to the other part of the newly loaded digit +// ready for the next time round the loop. + + mov l, xzr + cmp i, n + bcs bignum_cdiv_exact_noload + ldr l, [x, i, lsl #3] +bignum_cdiv_exact_noload: + rorv l, l, e + bic a, l, f + orr a, d, a + and d, l, f + +// Now a is the next digit after shifting right by e places, c the carry-in. +// Do the main Montgomery step with the (odd) m, writing back ~q. 
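+
+// A minimal C sketch of this step (illustrative only, not part of the
+// upstream source), using unsigned __int128 for the double-word product.
+// q is chosen so the running value becomes divisible by 2^64, and its
+// complement ~q is the digit written back to z:
+//
+//   unsigned __int128 s = (unsigned __int128) a + c;
+//   uint64_t a1 = (uint64_t) s;
+//   uint64_t q = a1 * w;                        // a1 + q * m == 0 (mod 2^64)
+//   z[i] = ~q;
+//   unsigned __int128 p = (unsigned __int128) q * m + a1;
+//   c = (uint64_t) (p >> 64) + (uint64_t) (s >> 64);  // carry to next digit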
+ + adds a, a, c + mul q, a, w + cset c, cs + mvn l, q + str l, [z, i, lsl #3] + + mul l, q, m + umulh h, q, m + + adds l, l, a + adc c, h, c + + add i, i, #1 + cmp i, k + bcc bignum_cdiv_exact_loop + +bignum_cdiv_exact_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cld.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cld.S new file mode 100644 index 00000000000..3952abbc024 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cld.S @@ -0,0 +1,52 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count leading zero digits (64-bit words) +// Input x[k]; output function return +// +// extern uint64_t bignum_cld (uint64_t k, uint64_t *x); +// +// In the case of a zero bignum as input the result is k +// +// Standard ARM ABI: X0 = k, X1 = x, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cld) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cld) + .text + .balign 4 + +#define k x0 +#define x x1 +#define i x2 +#define a x3 +#define j x4 + + +S2N_BN_SYMBOL(bignum_cld): + +// If the bignum is zero-length, x0 is already the right answer of k = 0 + + cbz k, bignum_cld_end + +// Run over the words j = 0..i-1, and set i := j + 1 when hitting nonzero a[j] + + mov i, xzr + mov j, xzr +bignum_cld_loop: + ldr a, [x, j, lsl #3] + add j, j, #1 + cmp a, #0 + csel i, j, i, ne + cmp j, k + bne bignum_cld_loop + + sub x0, x0, i +bignum_cld_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_clz.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_clz.S new file mode 100644 index 00000000000..48c6da3f761 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_clz.S @@ -0,0 +1,68 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count leading zero bits +// Input x[k]; output function return +// +// extern uint64_t bignum_clz (uint64_t k, uint64_t *x); +// +// In the case of a zero bignum as input the result is 64 * k +// +// In principle this has a precondition k < 2^58, but obviously that +// is always true in practice because of address space limitations +// +// Standard ARM ABI: X0 = k, X1 = x, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_clz) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_clz) + .text + .balign 4 + +#define k x0 +#define x x1 +#define i x2 +#define w x3 +#define a x4 +#define j x5 + + +S2N_BN_SYMBOL(bignum_clz): + +// If the bignum is zero-length, x0 is already the right answer of 0 + + cbz k, bignum_clz_end + +// Use w = a[i-1] to store nonzero words in a bottom-up sweep +// Set the initial default to be as if we had a 11...11 word directly below + + mov i, xzr + mov w, #-1 + mov j, xzr +bignum_clz_loop: + ldr a, [x, j, lsl #3] + add j, j, #1 + cmp a, #0 + csel i, j, i, ne + csel w, a, w, ne + cmp j, k + bne bignum_clz_loop + +// Now w = a[i-1] is the highest nonzero word, or in the zero case the +// default of the "extra" 11...11 = a[0-1]. We now want 64*(k - i) + clz(w). +// Note that this code does not rely on the behavior of the clz instruction +// for zero inputs, though the ARM manual does in fact guarantee clz(0) = 64. + + sub k, k, i + lsl k, k, #6 + clz a, w + add x0, k, a + +bignum_clz_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cmadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cmadd.S new file mode 100644 index 00000000000..6211707f5e9 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cmadd.S @@ -0,0 +1,113 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply-add with single-word multiplier, z := z + c * y +// Inputs c, y[n]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_cmadd +// (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); +// +// Does the "z := z + c * y" operation where y is n digits, result z is p. +// Truncates the result in general. +// +// The return value is a high/carry word that is meaningful when p = n + 1, or +// more generally when n <= p and the result fits in p + 1 digits. In these +// cases it gives the top digit of the (p + 1)-digit result. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = c, X3 = n, X4 = y, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmadd) + .text + .balign 4 + +#define p x0 +#define z x1 +#define c x2 +#define n x3 +#define x x4 + +#define i x5 +#define h x6 +#define l x7 +#define a x8 + +#define b x9 + + +S2N_BN_SYMBOL(bignum_cmadd): + +// First clamp the input size n := min(p,n) since we can never need to read +// past the p'th term of the input to generate p-digit output. 
+// Subtract p := p - min(n,p) so it holds the size of the extra tail needed + + cmp n, p + csel n, p, n, cs + sub p, p, n + +// Initialize high part h = 0; if n = 0 do nothing but return that zero + + adds h, xzr, xzr + cbz n, bignum_cmadd_end + +// Initialization of the loop: 2^64 * CF + [h,z_0'] = z_0 + c * x_0 + + ldr a, [x] + mul l, c, a + umulh h, c, a + ldr b, [z] + adds b, b, l + str b, [z] + mov i, #8 + sub n, n, #1 + cbz n, bignum_cmadd_tail + +// Main loop, where we always have CF + previous high part h to add in + +bignum_cmadd_loop: + ldr a, [x, i] + ldr b, [z, i] + mul l, c, a + adcs b, b, h + umulh h, c, a + adc h, h, xzr + adds b, b, l + str b, [z, i] + add i, i, #8 + sub n, n, #1 + cbnz n, bignum_cmadd_loop + +// Propagate the carry all the way to the end with h as extra carry word + +bignum_cmadd_tail: + cbz p, bignum_cmadd_end + ldr b, [z, i] + adcs b, b, h + str b, [z, i] + mov h, xzr + sub p, p, #1 + cbz p, bignum_cmadd_end + +bignum_cmadd_tloop: + add i, i, #8 + ldr b, [z, i] + adcs b, b, xzr + str b, [z, i] + sub p, p, #1 + cbnz p, bignum_cmadd_tloop + +// Return the high/carry word. This gives the top word of the result provided +// n <= p and the result fits in p + 1 digits. More generally, indeed, the +// 2^64 * CF + return = the top part of the result whenever n <= p, though this +// is not very exploitable from a C call. + +bignum_cmadd_end: + adcs x0, h, xzr + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cmnegadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cmnegadd.S new file mode 100644 index 00000000000..549fba7c9cc --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cmnegadd.S @@ -0,0 +1,127 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Negated multiply-add with single-word multiplier, z := z - c * y +// Inputs c, y[n]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_cmnegadd +// (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); +// +// Does the "z := z - c * y" operation where y is n digits, result z is p. +// Truncates the result in general. +// +// The return value is a high/carry word that is meaningful when n <= p. +// It is interpreted negatively as z' - 2^{64k} * return = z - c * y. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = c, X3 = n, X4 = y, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmnegadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmnegadd) + .text + .balign 4 + +#define p x0 +#define z x1 +#define c x2 +#define n x3 +#define x x4 + +#define i x5 +#define h x6 +#define l x7 +#define a x8 + +#define b x9 + + +S2N_BN_SYMBOL(bignum_cmnegadd): + +// First clamp the input size n := min(p,n) since we can never need to read +// past the p'th term of the input to generate p-digit output. 
+// Subtract p := p - min(n,p) so it holds the size of the extra tail needed + + cmp n, p + csel n, p, n, cs + sub p, p, n + +// Initialize high part h = 0; if n = 0 do nothing but return that zero + + mov h, xzr + cbz n, bignum_cmnegadd_end + +// Initialization of the loop: 2^64 * CF + [h,z_0'] = z_0 + c * ~x_0 + c + + ldr a, [x] + mvn a, a + mul l, c, a + umulh h, c, a + adds l, l, c + adc h, h, xzr + ldr b, [z] + adds b, b, l + str b, [z] + mov i, #8 + sub n, n, #1 + cbz n, bignum_cmnegadd_tail + +// Main loop, where we always have CF + previous high part h to add in + +bignum_cmnegadd_loop: + ldr a, [x, i] + ldr b, [z, i] + mvn a, a + mul l, c, a + adcs b, b, h + umulh h, c, a + adc h, h, xzr + adds b, b, l + str b, [z, i] + add i, i, #8 + sub n, n, #1 + cbnz n, bignum_cmnegadd_loop + +// At this point we have 2^{64n} * (h + CF) + z' = z + c * (2^{64n} - x) +// so z' - 2^{64n} * (c - (h + CF)) = z - c * x. +// Since z - c * x < 2^{64n} we must have c - (h + CF) >= 0. +// Accumulate the negative carry in h for consistency with trivial cases. + +bignum_cmnegadd_tail: + adc h, h, xzr + sub h, c, h + +// Propagate the carry all the way to the end with h as extra carry word + + cbz p, bignum_cmnegadd_end + ldr b, [z, i] + subs b, b, h + str b, [z, i] + mov h, xzr + sub p, p, #1 + cbz p, bignum_cmnegadd_highend + +bignum_cmnegadd_tloop: + add i, i, #8 + ldr b, [z, i] + sbcs b, b, xzr + str b, [z, i] + sub p, p, #1 + cbnz p, bignum_cmnegadd_tloop + +// Adjust the high word with the inverted carry h := h + (1 - CF) + +bignum_cmnegadd_highend: + cset x0, cc + add h, h, x0 + +// Now copy h into the function return + +bignum_cmnegadd_end: + mov x0, h + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cmod.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cmod.S new file mode 100644 index 00000000000..baf57e8c4d5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cmod.S @@ -0,0 +1,179 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Find bignum modulo a single word +// Input x[k], m; output function return +// +// extern uint64_t bignum_cmod (uint64_t k, uint64_t *x, uint64_t m); +// +// Returns x mod m, assuming m is nonzero. +// +// Standard ARM ABI: X0 = k, X1 = x, X2 = m, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmod) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmod) + .text + .balign 4 + +#define k x0 +#define x x1 +#define m x2 + +#define e x3 +#define n x4 +#define w x5 + +#define r x6 +#define h x7 +#define l x8 +#define a x9 +#define d x10 + +// We re-use the k argument for a quotient estimate when it is no longer +// needed for traversal (x0 is modified for the return value anyway). + +#define q x0 + +S2N_BN_SYMBOL(bignum_cmod): + +// If the bignum is zero-length, x0 is already the right answer of 0 + + cbz k, bignum_cmod_end + +// Find number of leading zeros of m and let n = 2^e m so that for an +// in-scope (nonzero) input m we have n >= 2^63, e <= 63. 
+ + clz e, m + lsl n, m, e + +// A near-clone of word_recip so 2^64 + w = ceil(2^128 / n) - 1 + + lsr a, n, #16 + eor w, a, #0x1ffffffffffff + add a, a, #0x1 + lsr w, w, #32 + mneg r, a, w + lsr d, r, #49 + mul d, d, d + lsr r, r, #34 + add r, d, r + orr d, d, #0x40000000 + mul d, r, d + lsr d, d, #30 + lsl r, w, #30 + madd w, w, d, r + lsr w, w, #30 + mneg r, a, w + lsr r, r, #24 + mul r, r, w + lsl w, w, #16 + lsr r, r, #24 + add w, w, r + mneg r, a, w + lsr r, r, #32 + mul r, r, w + lsl w, w, #31 + lsr r, r, #17 + add w, w, r + mul d, n, w + umulh r, n, w + extr d, r, d, #60 + lsr r, w, #33 + mvn d, d + mul d, r, d + lsl w, w, #1 + lsr d, d, #33 + add w, w, d + adds d, w, #0x1 + cinv d, d, eq + umulh r, n, d + adds xzr, r, n + csel w, w, d, cs + +// Take the residue r = 2^128 - (2^64 + w) * n, which by the above bound +// we know fits in 64 bits. We know 2^128 == r (mod n) and hence (mod m). + + mneg r, w, n + +// Now just go down through the digits accumulating [h;l] == x (mod n) +// by 2^64 * [h;l] + d = 2^128 * h + [l;d] == r * h + [l; d]. That addition +// may overflow with a carry, say 2^128 + [h';l'] = r * h + [l; d], in +// which case we subtract 2^128 - r (which is divisible by m and keeping +// things in 128 bits we just add r). Thus the overall bound when we initially +// overflow is r * h + [l; d] - (2^128 - r) = r * (h + 1) + [l; d] - 2^128 +// < 2^128 so we stay inside 2 words + + mov h, xzr + mov l, xzr +bignum_cmod_loop: + sub k, k, #1 + ldr d, [x, k, lsl #3] + mul a, r, h + umulh h, r, h + adds a, a, d + adcs h, h, l + csel l, r, xzr, cs + adds l, l, a + adc h, h, xzr + cbnz k, bignum_cmod_loop + +// Now do reciprocal multiplication to reduce the 2-word modular equivalent +// [h;l] to the single word l. If we assume the truncations are as follows +// 2^64 + w = 2^128 / n - epsilon (0 <= epsilon <= 1) +// q = (w * h / 2^64) - delta (0 <= delta <= 1) +// the net remainder is l + (h/2^64 * epsilon + delta) * n < l + 2 * n. +// In general this needs two rounds of comparison to guarantee getting +// into a single word (though one more mul could be used instead). +// Also, the quotient estimate can overflow so we use r as extra addend +// 2^64 * n when the initial addition overflows. The overall multiple +// of n can't itself overflow, since we know it's an underestimate of +// the initial residue. + + umulh q, w, h + adds q, q, h + csel r, n, xzr, cs + + mul a, q, n + umulh d, q, n + add d, d, r + subs l, l, a + sbcs h, h, d + + csel a, n, xzr, ne + subs l, l, a + sbcs h, h, xzr + + csel a, n, xzr, ne + sub l, l, a + +// One more reciprocal multiplication to do a modular reduction, but now in +// one word and in terms of the original m. For the quotient estimate we want +// q = ((2^64 + w) * l) / 2^{128-e} = ((2^64 + w) * l) / 2^65 / 2^{63-e}. + + umulh q, w, l + adds q, q, l + cset r, cs + extr q, r, q, #1 + + eor e, e, #63 + lsr q, q, e + + mul a, q, m + sub l, l, a + +// Note that since there is no neglected "low" part of the single word, +// one round of correction suffices; in the analog of the above l = 0 +// and hence the residue so far is already < 2 * m. 
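+
+// A minimal C sketch of this final reduction (illustrative only, not part of
+// the upstream source), with e = clz(m) as set at the top of the function and
+// unsigned __int128 used for the double-word product:
+//
+//   uint64_t hi = (uint64_t) (((unsigned __int128) w * l) >> 64);
+//   unsigned __int128 s = (unsigned __int128) hi + l; // ((2^64 + w) * l) >> 64
+//   uint64_t q = (uint64_t) (s >> 1);                 // now divided by 2^65
+//   q >>= (e ^ 63);                                   // 63 - e more: / 2^{128-e}
+//   l -= q * m;                                       // now 0 <= l < 2 * m
+//   return (l < m) ? l : l - m;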
+ + subs x0, l, m + csel x0, x0, l, cs + +bignum_cmod_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cmul.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cmul.S new file mode 100644 index 00000000000..c17cd621568 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cmul.S @@ -0,0 +1,104 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word, z := c * y +// Inputs c, y[n]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_cmul +// (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); +// +// Does the "z := c * y" operation where y is n digits, result z is p. +// Truncates the result in general unless p >= n + 1. +// +// The return value is a high/carry word that is meaningful when p >= n as +// giving the high part of the result. Since this is always zero if p > n, +// it is mainly of interest in the special case p = n, i.e. where the source +// and destination have the same nominal size, when it gives the extra word +// of the full result. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = c, X3 = n, X4 = y, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul) + .text + .balign 4 + +#define p x0 +#define z x1 +#define c x2 +#define n x3 +#define x x4 + +#define i x5 +#define h x6 +#define l x7 +#define a x8 + + +S2N_BN_SYMBOL(bignum_cmul): + +// First clamp the input size n := min(p,n) since we can never need to read +// past the p'th term of the input to generate p-digit output. +// Subtract p := p - min(n,p) so it holds the size of the extra tail needed + + cmp n, p + csel n, p, n, cs + sub p, p, n + +// Initialize current input/output pointer offset i and high part h. +// But then if n = 0 skip the multiplication and go to the tail part + + mov h, xzr + mov i, xzr + cbz n, bignum_cmul_tail + +// Initialization of the loop: [h,l] = c * x_0 + + ldr a, [x] + mul l, c, a + umulh h, c, a + str l, [z] + add i, i, #8 + subs n, n, #1 + beq bignum_cmul_tail + +// Main loop (force CF = 0 at the beginning) + + adds xzr, xzr, xzr +bignum_cmul_loop: + ldr a, [x, i] + mul l, c, a + adcs l, l, h + umulh h, c, a + str l, [z, i] + add i, i, #8 + sub n, n, #1 + cbnz n, bignum_cmul_loop + + adc h, h, xzr + +bignum_cmul_tail: + cbz p, bignum_cmul_end + str h, [z, i] + mov h, xzr + subs p, p, #1 + beq bignum_cmul_end + +bignum_cmul_tloop: + add i, i, #8 + str xzr, [z, i] + sub p, p, #1 + cbnz p, bignum_cmul_tloop + +// Return the high/carry word + +bignum_cmul_end: + mov x0, h + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_coprime.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_coprime.S new file mode 100644 index 00000000000..3bed2cc8ae1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_coprime.S @@ -0,0 +1,450 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Test bignums for coprimality, gcd(x,y) = 1 +// Inputs x[m], y[n]; output function return; temporary buffer t[>=2*max(m,n)] +// +// extern uint64_t bignum_coprime +// (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y, uint64_t *t); +// +// Test for whether two bignums are coprime (no common factor besides 1). +// This is equivalent to testing if their gcd is 1, but a bit faster than +// doing those two computations separately. +// +// Here bignum x is m digits long, y is n digits long and the temporary +// buffer t needs to be 2 * max(m,n) digits long. The return value is +// 1 if coprime(x,y) and 0 otherwise. +// +// Standard ARM ABI: X0 = m, X1 = x, X2 = n, X3 = y, X4 = t, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_coprime) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_coprime) + .text + .balign 4 + +#define CHUNKSIZE 58 + +// Pervasive variables + +#define k x9 +#define m x4 +#define n x5 + +// Used via parameters in copy-in loop, then re-used as outer loop +// counter t and adaptive precision digit size l, which becomes a +// reduced version of k in later iterations but starts at l = k + +#define x x1 +#define y x3 + +#define t x2 +#define l x3 + +// The matrix of update factors to apply to m and n +// Also used a couple of additional temporary variables for the swapping loop +// Also used as an extra down-counter in corrective negation loops + +#define m_m x6 +#define m_n x7 +#define n_m x8 +#define n_n x1 + +#define t3 x6 +#define t4 x7 + +#define j x6 + +// General temporary variables and loop counters + +#define i x10 +#define t1 x11 +#define t2 x12 + +// High and low proxies for the inner loop +// Then re-used for high and carry words during actual cross-multiplications + +#define m_hi x13 +#define n_hi x14 +#define m_lo x15 +#define n_lo x16 + +#define h1 x13 +#define h2 x14 +#define l1 x15 +#define l2 x16 + +#define c1 x17 +#define c2 x19 +#define tt x20 + + +S2N_BN_SYMBOL(bignum_coprime): + +// We make use of just a couple of additional registers + + stp x19, x20, [sp, #-16]! + +// Compute k = max(m,n), and if this is zero skip to the end. Note that +// in this case x0 = m = 0 so we return the right answer of "false" + + cmp x0, x2 + csel k, x2, x0, cc + cbz k, bignum_coprime_end + +// Set up inside w two size-k buffers m and n + + lsl i, k, #3 + add n, m, i + +// Copy the input x into the buffer m, padding with zeros as needed + + mov i, xzr + cbz x0, bignum_coprime_xpadloop +bignum_coprime_xloop: + ldr t1, [x1, i, lsl #3] + str t1, [m, i, lsl #3] + add i, i, #1 + cmp i, x0 + bcc bignum_coprime_xloop + cmp i, k + bcs bignum_coprime_xskip +bignum_coprime_xpadloop: + str xzr, [m, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_coprime_xpadloop +bignum_coprime_xskip: + +// Copy the input y into the buffer n, padding with zeros as needed + + mov i, xzr + cbz x2, bignum_coprime_ypadloop +bignum_coprime_yloop: + ldr t1, [x3, i, lsl #3] + str t1, [n, i, lsl #3] + add i, i, #1 + cmp i, x2 + bcc bignum_coprime_yloop + cmp i, k + bcs bignum_coprime_yskip +bignum_coprime_ypadloop: + str xzr, [n, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_coprime_ypadloop +bignum_coprime_yskip: + +// Set up the outer loop count of 64 * sum of input sizes. +// The invariant is that m * n < 2^t at all times. 
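+
+// In C terms (illustrative only, not part of the upstream source), with
+// msize/nsize standing for the original digit counts still in x0 and x2:
+//
+//   uint64_t t = (msize + nsize) << 6;   // t = 64 * (msize + nsize)
+//
+// Initially m < 2^{64*msize} and n < 2^{64*nsize}, so m * n < 2^t; the outer
+// loop below preserves m * n < 2^t while t drops by CHUNKSIZE each time, so
+// once t <= 0 we have m * n < 1, i.e. m = 0 and n holds the gcd.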
+ + add t, x0, x2 + lsl t, t, #6 + +// Record for the very end the OR of the lowest words. +// If the bottom bit is zero we know both are even so the answer is false. +// But since this is constant-time code we still execute all the main part. + + ldr x0, [m] + ldr t3, [n] + orr x0, x0, t3 + +// Now if n is even trigger a swap of m and n. This ensures that if +// one or other of m and n is odd then we make sure now that n is, +// as expected by our invariant later on. + + and t3, t3, #1 + sub t3, t3, #1 + + mov i, xzr +bignum_coprime_swaploop: + ldr t1, [m, i, lsl #3] + ldr t2, [n, i, lsl #3] + eor t4, t1, t2 + and t4, t4, t3 + eor t1, t1, t4 + eor t2, t2, t4 + str t1, [m, i, lsl #3] + str t2, [n, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_coprime_swaploop + +// Start of the main outer loop iterated t / CHUNKSIZE times + +bignum_coprime_outerloop: + +// We need only bother with sharper l = min k (ceil(t/64)) digits +// Either both m and n fit in l digits, or m has become zero and so +// nothing happens in the loop anyway and this makes no difference. + + add i, t, #63 + lsr l, i, #6 + cmp l, k + csel l, k, l, cs + +// Select upper and lower proxies for both m and n to drive the inner +// loop. The lower proxies are simply the lowest digits themselves, +// m_lo = m[0] and n_lo = n[0], while the upper proxies are bitfields +// of the two inputs selected so their top bit (63) aligns with the +// most significant bit of *either* of the two inputs. + + mov h1, xzr // Previous high and low for m + mov l1, xzr + mov h2, xzr // Previous high and low for n + mov l2, xzr + mov c2, xzr // Mask flag: previous word of one was nonzero + // and in this case h1 and h2 are those words + + mov i, xzr +bignum_coprime_toploop: + ldr t1, [m, i, lsl #3] + ldr t2, [n, i, lsl #3] + orr c1, t1, t2 + cmp c1, xzr + and c1, c2, h1 + csel l1, c1, l1, ne + and c1, c2, h2 + csel l2, c1, l2, ne + csel h1, t1, h1, ne + csel h2, t2, h2, ne + csetm c2, ne + add i, i, #1 + cmp i, l + bcc bignum_coprime_toploop + + orr t1, h1, h2 + clz t2, t1 + negs c1, t2 + lsl h1, h1, t2 + csel l1, l1, xzr, ne + lsl h2, h2, t2 + csel l2, l2, xzr, ne + lsr l1, l1, c1 + lsr l2, l2, c1 + orr m_hi, h1, l1 + orr n_hi, h2, l2 + + ldr m_lo, [m] + ldr n_lo, [n] + +// Now the inner loop, with i as loop counter from CHUNKSIZE down. +// This records a matrix of updates to apply to the initial +// values of m and n with, at stage j: +// +// sgn * m' = (m_m * m - m_n * n) / 2^j +// -sgn * n' = (n_m * m - n_n * n) / 2^j +// +// where "sgn" is either +1 or -1, and we lose track of which except +// that both instance above are the same. This throwing away the sign +// costs nothing (since we have to correct in general anyway because +// of the proxied comparison) and makes things a bit simpler. But it +// is simply the parity of the number of times the first condition, +// used as the swapping criterion, fires in this loop. + + mov m_m, #1 + mov m_n, xzr + mov n_m, xzr + mov n_n, #1 + + mov i, #CHUNKSIZE + +// Conceptually in the inner loop we follow these steps: +// +// * If m_lo is odd and m_hi < n_hi, then swap the four pairs +// (m_hi,n_hi); (m_lo,n_lo); (m_m,n_m); (m_n,n_n) +// +// * Now, if m_lo is odd (old or new, doesn't matter as initial n_lo is odd) +// m_hi := m_hi - n_hi, m_lo := m_lo - n_lo +// m_m := m_m + n_m, m_n := m_n + n_n +// +// * Halve and double them +// m_hi := m_hi / 2, m_lo := m_lo / 2 +// n_m := n_m * 2, n_n := n_n * 2 +// +// The actual computation computes updates before actually swapping and +// then corrects as needed. 
It also maintains the invariant ~ZF <=> odd(m_lo), +// since it seems to reduce the dependent latency. Set that up first. + + ands xzr, m_lo, #1 + +bignum_coprime_innerloop: + +// At the start of the loop ~ZF <=> m_lo is odd; mask values accordingly +// Set the flags for m_hi - [~ZF] * n_hi so we know to flip things. + + csel t1, n_hi, xzr, ne + csel t2, n_lo, xzr, ne + csel c1, n_m, xzr, ne + csel c2, n_n, xzr, ne + ccmp m_hi, n_hi, #0x2, ne + +// Compute subtractive updates, trivial in the case ZF <=> even(m_lo). + + sub t1, m_hi, t1 + sub t2, m_lo, t2 + +// If the subtraction borrows, swap things appropriately, negating where +// we've already subtracted so things are as if we actually swapped first. + + csel n_hi, n_hi, m_hi, cs + cneg t1, t1, cc + csel n_lo, n_lo, m_lo, cs + cneg m_lo, t2, cc + csel n_m, n_m, m_m, cs + csel n_n, n_n, m_n, cs + +// Update and shift while setting oddness flag for next iteration +// We look at bit 1 of t2 (m_lo before possible negation), which is +// safe because it is even. + + ands xzr, t2, #2 + add m_m, m_m, c1 + add m_n, m_n, c2 + lsr m_hi, t1, #1 + lsr m_lo, m_lo, #1 + add n_m, n_m, n_m + add n_n, n_n, n_n + +// Next iteration; don't disturb the flags since they are used at entry + + sub i, i, #1 + cbnz i, bignum_coprime_innerloop + +// Now actually compute the updates to m and n corresponding to that matrix, +// and correct the signs if they have gone negative. First we compute the +// (k+1)-sized updates +// +// c1::h1::m = m_m * m - m_n * n +// c2::h2::n = n_m * m - n_n * n +// +// then for each one, sign-correct and shift by CHUNKSIZE + + mov h1, xzr + mov h2, xzr + mov c1, xzr + mov c2, xzr + mov i, xzr +bignum_coprime_crossloop: + ldr t1, [m, i, lsl #3] + ldr t2, [n, i, lsl #3] + + mul l1, m_m, t1 + mul l2, m_n, t2 + adds l1, l1, h1 + umulh h1, m_m, t1 + adc h1, h1, xzr + umulh tt, m_n, t2 + sub c1, tt, c1 + subs l1, l1, l2 + str l1, [m, i, lsl #3] + sbcs h1, h1, c1 + csetm c1, cc + + mul l1, n_m, t1 + mul l2, n_n, t2 + adds l1, l1, h2 + umulh h2, n_m, t1 + adc h2, h2, xzr + umulh tt, n_n, t2 + sub c2, tt, c2 + subs l1, l1, l2 + str l1, [n, i, lsl #3] + sbcs h2, h2, c2 + csetm c2, cc + + add i, i, #1 + cmp i, l + bcc bignum_coprime_crossloop + +// Write back m optionally negated and shifted right CHUNKSIZE bits + + adds xzr, c1, c1 + + ldr l1, [m] + mov i, xzr + sub j, l, #1 + cbz j, bignum_coprime_negskip1 + +bignum_coprime_negloop1: + add t1, i, #8 + ldr t2, [m, t1] + extr l1, t2, l1, #CHUNKSIZE + eor l1, l1, c1 + adcs l1, l1, xzr + str l1, [m, i] + mov l1, t2 + add i, i, #8 + sub j, j, #1 + cbnz j, bignum_coprime_negloop1 +bignum_coprime_negskip1: + extr l1, h1, l1, #CHUNKSIZE + eor l1, l1, c1 + adcs l1, l1, xzr + str l1, [m, i] + +// Write back n optionally negated and shifted right CHUNKSIZE bits + + adds xzr, c2, c2 + + ldr l1, [n] + mov i, xzr + sub j, l, #1 + cbz j, bignum_coprime_negskip2 +bignum_coprime_negloop2: + add t1, i, #8 + ldr t2, [n, t1] + extr l1, t2, l1, #CHUNKSIZE + eor l1, l1, c2 + adcs l1, l1, xzr + str l1, [n, i] + mov l1, t2 + add i, i, #8 + sub j, j, #1 + cbnz j, bignum_coprime_negloop2 +bignum_coprime_negskip2: + extr l1, h2, l1, #CHUNKSIZE + eor l1, l1, c2 + adcs l1, l1, xzr + str l1, [n, i] + +// End of main loop. We can stop if t' <= 0 since then m * n < 2^0, which +// since n is odd (in the main cases where we had one or other input odd) +// means that m = 0 and n is the final gcd. 
Moreover we do in fact need to +// maintain strictly t > 0 in the main loop, or the computation of the +// optimized digit bound l could collapse to 0. + + subs t, t, #CHUNKSIZE + bhi bignum_coprime_outerloop + +// Now compare n with 1 (OR of the XORs in t1) + + ldr t1, [n] + eor t1, t1, #1 + cmp k, #1 + beq bignum_coprime_finalcomb + mov i, #1 +bignum_coprime_compareloop: + ldr t2, [n, i, lsl #3] + orr t1, t1, t2 + add i, i, #1 + cmp i, k + bcc bignum_coprime_compareloop + +// Now combine that with original oddness flag, which is still in x0 + +bignum_coprime_finalcomb: + cmp t1, xzr + cset t1, eq + and x0, x0, t1 + +bignum_coprime_end: + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy.S new file mode 100644 index 00000000000..a4c0a7fd818 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy.S @@ -0,0 +1,63 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Copy bignum with zero-extension or truncation, z := x +// Input x[n]; output z[k] +// +// extern void bignum_copy +// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x); +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = n, X3 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_copy) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_copy) + .text + .balign 4 + +#define k x0 +#define z x1 +#define n x2 +#define x x3 + +#define i x4 +#define a x5 + + +S2N_BN_SYMBOL(bignum_copy): + +// Replace n with min(k,n) so we are definitely safe copying those +// Initialize the element counter to 0 + + cmp k, n + csel n, k, n, cc + mov i, #0 + +// If min(k,n) = 0 jump to the padding stage + + cbz n, bignum_copy_padding + +bignum_copy_copyloop: + ldr a, [x, i, lsl #3] + str a, [z, i, lsl #3] + add i, i, #1 + cmp i, n + bcc bignum_copy_copyloop + +bignum_copy_padding: + cmp i, k + bcs bignum_copy_end +bignum_copy_padloop: + str xzr, [z, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_copy_padloop + +bignum_copy_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy_row_from_table.S similarity index 100% rename from third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy_row_from_table.S diff --git a/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_16_neon.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy_row_from_table_16.S similarity index 92% rename from third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_16_neon.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy_row_from_table_16.S index ea0bef702ce..764fbbb8ffb 100644 --- a/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_16_neon.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy_row_from_table_16.S @@ -8,7 +8,7 @@ // achieved by reading the whole table and using the bit-masking to get the // `idx`-th 
row. // -// extern void bignum_copy_from_table_16_neon +// extern void bignum_copy_from_table_16 // (uint64_t *z, uint64_t *table, uint64_t height, uint64_t idx); // // Initial version written by Hanno Becker @@ -16,8 +16,8 @@ // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_copy_row_from_table_16_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_copy_row_from_table_16_neon) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_copy_row_from_table_16) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_copy_row_from_table_16) .text .balign 4 @@ -57,7 +57,7 @@ #define vmask v17 -S2N_BN_SYMBOL(bignum_copy_row_from_table_16_neon): +S2N_BN_SYMBOL(bignum_copy_row_from_table_16): // Clear accumulator // Zeroing can be done via xor, but xor isn't formalized yet. @@ -71,7 +71,7 @@ S2N_BN_SYMBOL(bignum_copy_row_from_table_16_neon): mov ventry7.16b, ventry0.16b mov cnt, #0 -bignum_copy_row_from_table_16_neon_loop: +bignum_copy_row_from_table_16_loop: // Compute mask: Check if current index matches target index subs xzr, cnt, idx @@ -106,9 +106,9 @@ bignum_copy_row_from_table_16_neon_loop: add cnt, cnt, #1 subs xzr, height, cnt - b.ne bignum_copy_row_from_table_16_neon_loop + b.ne bignum_copy_row_from_table_16_loop -bignum_copy_row_from_table_16_neon_end: +bignum_copy_row_from_table_16_end: str qentry0, [z, #16*0] str qentry1, [z, #16*1] diff --git a/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_32_neon.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy_row_from_table_32.S similarity index 94% rename from third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_32_neon.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy_row_from_table_32.S index c3dc386990f..02a1fdb9151 100644 --- a/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_32_neon.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy_row_from_table_32.S @@ -8,7 +8,7 @@ // achieved by reading the whole table and using the bit-masking to get the // `idx`-th row. // -// extern void bignum_copy_from_table_32_neon +// extern void bignum_copy_from_table_32 // (uint64_t *z, uint64_t *table, uint64_t height, uint64_t idx); // // Initial version written by Hanno Becker @@ -16,8 +16,8 @@ // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_copy_row_from_table_32_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_copy_row_from_table_32_neon) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_copy_row_from_table_32) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_copy_row_from_table_32) .text .balign 4 @@ -72,7 +72,7 @@ #define vmask v17 -S2N_BN_SYMBOL(bignum_copy_row_from_table_32_neon): +S2N_BN_SYMBOL(bignum_copy_row_from_table_32): // Clear accumulator // Zeroing can be done via xor, but xor isn't formalized yet. 
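The bit-masking scan that these `bignum_copy_row_from_table*` variants implement can be summarized by a short C reference, shown here for the width-parameterized form; the name is illustrative, and a C compiler is not obliged to keep the mask computation branch-free, which is one reason the library keeps this in assembly:

```c
#include <stdint.h>

/* Reference for the table-scan idiom: touch every row of the table and
 * accumulate (row & mask), where mask is all-ones only for the wanted
 * index, so the memory access pattern is independent of idx.
 * The _16 and _32 variants are this with width fixed at 16 or 32. */
static void copy_row_from_table_ref(uint64_t *z, const uint64_t *table,
                                    uint64_t height, uint64_t width,
                                    uint64_t idx) {
  for (uint64_t j = 0; j < width; j++)
    z[j] = 0;
  for (uint64_t i = 0; i < height; i++) {
    uint64_t mask = (uint64_t)0 - (uint64_t)(i == idx); /* all-ones iff i==idx */
    for (uint64_t j = 0; j < width; j++)
      z[j] |= table[i * width + j] & mask;
  }
}
```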
@@ -94,7 +94,7 @@ S2N_BN_SYMBOL(bignum_copy_row_from_table_32_neon): mov ventry15.16b, ventry0.16b mov cnt, #0 -bignum_copy_row_from_table_32_neon_loop: +bignum_copy_row_from_table_32_loop: // Compute mask: Check if current index matches target index subs xzr, cnt, idx @@ -153,9 +153,9 @@ bignum_copy_row_from_table_32_neon_loop: add cnt, cnt, #1 subs xzr, height, cnt - b.ne bignum_copy_row_from_table_32_neon_loop + b.ne bignum_copy_row_from_table_32_loop -bignum_copy_row_from_table_32_neon_end: +bignum_copy_row_from_table_32_end: str qentry0, [z, #16*0] str qentry1, [z, #16*1] diff --git a/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_8n_neon.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy_row_from_table_8n.S similarity index 77% rename from third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_8n_neon.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy_row_from_table_8n.S index b065a70525c..8c21b1b8488 100644 --- a/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_8n_neon.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy_row_from_table_8n.S @@ -8,15 +8,15 @@ // achieved by reading the whole table and using the bit-masking to get the // `idx`-th row. // -// extern void bignum_copy_from_table_8_neon +// extern void bignum_copy_from_table_8n // (uint64_t *z, uint64_t *table, uint64_t height, uint64_t width, uint64_t idx); // // Standard ARM ABI: X0 = z, X1 = table, X2 = height, X3 = width, X4 = idx // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_copy_row_from_table_8n_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_copy_row_from_table_8n_neon) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_copy_row_from_table_8n) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_copy_row_from_table_8n) .text .balign 4 @@ -33,27 +33,27 @@ #define vmask v16 -S2N_BN_SYMBOL(bignum_copy_row_from_table_8n_neon): +S2N_BN_SYMBOL(bignum_copy_row_from_table_8n): - cbz height, bignum_copy_row_from_table_8n_neon_end - cbz width, bignum_copy_row_from_table_8n_neon_end + cbz height, bignum_copy_row_from_table_8n_end + cbz width, bignum_copy_row_from_table_8n_end mov i, width mov x6, z dup v16.2d, xzr -bignum_copy_row_from_table_8n_neon_initzero: +bignum_copy_row_from_table_8n_initzero: str q16, [x6] str q16, [x6, #16] str q16, [x6, #32] str q16, [x6, #48] add x6, x6, #64 subs i, i, #8 - bne bignum_copy_row_from_table_8n_neon_initzero + bne bignum_copy_row_from_table_8n_initzero mov i, xzr mov x8, table -bignum_copy_row_from_table_8n_neon_outerloop: +bignum_copy_row_from_table_8n_outerloop: cmp i, idx csetm mask, eq @@ -62,7 +62,7 @@ bignum_copy_row_from_table_8n_neon_outerloop: mov j, width mov x9, z -bignum_copy_row_from_table_8n_neon_innerloop: +bignum_copy_row_from_table_8n_innerloop: ldr q17, [x8] ldr q18, [x9] @@ -87,14 +87,14 @@ bignum_copy_row_from_table_8n_neon_innerloop: add x8, x8, #64 add x9, x9, #64 subs j, j, #8 - bne bignum_copy_row_from_table_8n_neon_innerloop + bne bignum_copy_row_from_table_8n_innerloop -bignum_copy_row_from_table_8n_neon_innerloop_done: +bignum_copy_row_from_table_8n_innerloop_done: add i, i, #1 cmp i, height - bne bignum_copy_row_from_table_8n_neon_outerloop + bne bignum_copy_row_from_table_8n_outerloop -bignum_copy_row_from_table_8n_neon_end: +bignum_copy_row_from_table_8n_end: ret #if defined(__linux__) && defined(__ELF__) diff --git 
a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_ctd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_ctd.S new file mode 100644 index 00000000000..8a721fc1516 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_ctd.S @@ -0,0 +1,53 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count trailing zero digits (64-bit words) +// Input x[k]; output function return +// +// extern uint64_t bignum_ctd (uint64_t k, uint64_t *x); +// +// In the case of a zero bignum as input the result is k +// +// Standard ARM ABI: X0 = k, X1 = x, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_ctd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_ctd) + .text + .balign 4 + +#define k x0 +#define x x1 +#define i x2 +#define a x3 + + +S2N_BN_SYMBOL(bignum_ctd): + +// If the bignum is zero-length, x0 is already the right answer of 0 + + cbz k, bignum_ctd_end + +// Record in i that the lowest nonzero word is i, where i = k means +// that the bignum was entirely zero + + mov i, k +bignum_ctd_loop: + sub k, k, #1 + ldr a, [x, k, lsl #3] + cmp a, #0 + csel i, k, i, ne + cbnz k, bignum_ctd_loop + +// Now return i + + mov x0, i + +bignum_ctd_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_ctz.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_ctz.S new file mode 100644 index 00000000000..8977925e0e1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_ctz.S @@ -0,0 +1,73 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count trailing zero bits +// Input x[k]; output function return +// +// extern uint64_t bignum_ctz (uint64_t k, uint64_t *x); +// +// +// In the case of a zero bignum as input the result is 64 * k +// +// In principle this has a precondition k < 2^58, but obviously that +// is always true in practice because of address space limitations +// +// Standard ARM ABI: X0 = k, X1 = x, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_ctz) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_ctz) + .text + .balign 4 + +#define k x0 +#define x x1 +#define i x2 +#define w x3 +#define a x4 + + +S2N_BN_SYMBOL(bignum_ctz): + +// If the bignum is zero-length, x0 is already the right answer of 0 + + cbz k, bignum_ctz_end + +// Use w = a[i] to store nonzero words in a top-down sweep +// Set the initial default to be as if we had a 1 word directly above + + mov i, k + mov w, #1 + +bignum_ctz_loop: + sub k, k, #1 + ldr a, [x, k, lsl #3] + cmp a, #0 + csel i, k, i, ne + csel w, a, w, ne + cbnz k, bignum_ctz_loop + +// Now w = a[i] is the lowest nonzero word, or in the zero case the +// default of the "extra" 1 = a[k]. We now want 64*i + ctz(w). +// +// ARM doesn't have a direct word ctz instruction, so we emulate it via +// ctz(w) = 64 - clz(~w & (w-1)). 
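In C, the same identity might be sketched as follows, with a portable helper standing in for the AArch64 CLZ instruction (which returns 64 for a zero input); both names here are illustrative:

```c
#include <stdint.h>

/* Portable stand-in for 64-bit clz; returns 64 for zero, matching the
 * AArch64 CLZ behaviour the identity below relies on. */
static unsigned clz64(uint64_t x) {
  unsigned n = 0;
  for (uint64_t bit = (uint64_t)1 << 63; bit != 0 && !(x & bit); bit >>= 1)
    n++;
  return n;
}

/* ctz(w) = 64 - clz(~w & (w - 1)): the expression ~w & (w - 1) isolates
 * exactly the trailing-zero bits of w as a block of low ones. */
static unsigned ctz64(uint64_t w) {
  return 64 - clz64(~w & (w - 1));
}
```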
This is depending, for cases of the form +// ctz(....1), on the behavior clz(0) = 64, which is guaranteed according +// to the ARM manual. + + mvn a, w + sub w, w, #1 + add i, i, #1 + and w, w, a + lsl i, i, #6 + clz a, w + sub x0, i, a + +bignum_ctz_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_demont.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_demont.S new file mode 100644 index 00000000000..d93b5b0eacd --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_demont.S @@ -0,0 +1,156 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from (almost-)Montgomery form z := (x / 2^{64k}) mod m +// Inputs x[k], m[k]; output z[k] +// +// extern void bignum_demont +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); +// +// Does z := (x / 2^{64k}) mod m, hence mapping out of Montgomery domain. +// In other words, this is a k-fold Montgomery reduction with same-size input. +// This can handle almost-Montgomery inputs, i.e. any k-digit bignum. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = x, X3 = m +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_demont) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_demont) + .text + .balign 4 + +#define k x0 +#define z x1 +#define x x2 +#define m x3 + +// Negated modular inverse +#define w x4 +// Outer loop counter +#define i x5 +// Inner loop counter +#define j x6 +// Home for Montgomery multiplier +#define d x7 + +#define h x8 +#define e x9 +#define l x10 +#define a x11 + +// Some more intuitive names for temp regs in initial word-level negmodinv. +// These just use i and j again, which aren't used early on. + +#define one x5 +#define e1 x5 +#define e2 x6 +#define e4 x5 +#define e8 x6 + + +S2N_BN_SYMBOL(bignum_demont): + +// If k = 0 the whole operation is trivial + + cbz k, bignum_demont_end + +// Compute word-level negated modular inverse w for m == m[0]. +// This is essentially the same as word_negmodinv. + + ldr a, [m] + lsl w, a, #2 + sub w, a, w + eor w, w, #2 + mov one, #1 + madd e1, a, w, one + mul e2, e1, e1 + madd w, e1, w, w + mul e4, e2, e2 + madd w, e2, w, w + mul e8, e4, e4 + madd w, e4, w, w + madd w, e8, w, w + +// Initially just copy the input to the output. It would be a little more +// efficient but somewhat fiddlier to tweak the zeroth iteration below instead. 
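The madd/mul chain above computes a word-level negated modular inverse; a C sketch of the same computation (illustrative name, all arithmetic implicitly mod 2^64) looks like this, where the start value is correct in a handful of low bits and each step squares the error term, doubling the number of correct bits:

```c
#include <stdint.h>

/* Sketch of the word-level negated modular inverse: for odd m, returns w
 * with m * w == -1 (mod 2^64).  Mirrors the lsl/sub/eor/madd sequence. */
static uint64_t word_negmodinv_ref(uint64_t m) {
  uint64_t w = (m - (m << 2)) ^ 2;   /* initial approximation */
  uint64_t e = m * w + 1;            /* error term: m*w = -1 + e (mod 2^64) */
  w += e * w; e *= e;                /* each step: w *= (1 + e), e := e^2 */
  w += e * w; e *= e;
  w += e * w; e *= e;
  w += e * w;                        /* error now vanishes mod 2^64 */
  return w;
}
```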
+ + mov i, xzr +bignum_demont_iloop: + ldr a, [x, i, lsl #3] + str a, [z, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_demont_iloop + +// Outer loop, just doing a standard Montgomery reduction on z + + mov i, xzr +bignum_demont_outerloop: + + ldr e, [z] + mul d, e, w + ldr a, [m] + mul l, d, a + umulh h, d, a + adds e, e, l // Will be zero but want the carry + mov j, #1 + sub a, k, #1 + cbz a, bignum_demont_montend +bignum_demont_montloop: + ldr a, [m, j, lsl #3] + ldr e, [z, j, lsl #3] + mul l, d, a + adcs e, e, h + umulh h, d, a + adc h, h, xzr + adds e, e, l + sub l, j, #1 + str e, [z, l, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_demont_montloop +bignum_demont_montend: + adc h, xzr, h + sub l, j, #1 + str h, [z, l, lsl #3] + +// End of outer loop + + add i, i, #1 + cmp i, k + bcc bignum_demont_outerloop + +// Now do a comparison of z with m to set a final correction mask +// indicating that z >= m and so we need to subtract m. + + subs j, xzr, xzr +bignum_demont_cmploop: + ldr a, [z, j, lsl #3] + ldr e, [m, j, lsl #3] + sbcs xzr, a, e + add j, j, #1 + sub a, j, k + cbnz a, bignum_demont_cmploop + csetm h, cs + +// Now do a masked subtraction of m for the final reduced result. + + subs j, xzr, xzr +bignum_demont_corrloop: + ldr a, [z, j, lsl #3] + ldr e, [m, j, lsl #3] + and e, e, h + sbcs a, a, e + str a, [z, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_demont_corrloop + +bignum_demont_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_digit.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_digit.S new file mode 100644 index 00000000000..2d261fd96f4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_digit.S @@ -0,0 +1,59 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Select digit x[n] +// Inputs x[k], n; output function return +// +// extern uint64_t bignum_digit (uint64_t k, uint64_t *x, uint64_t n); +// +// n'th digit of a k-digit (digit=64 bits) bignum, in constant-time style. +// Indexing starts at 0, which is the least significant digit (little-endian). +// Returns zero if n >= k, i.e. we read a digit off the end of the bignum. 
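A C reference for this constant-time digit selection might look as follows (illustrative name; a C compiler gives no constant-time guarantee for the comparison, which is why the assembly uses csel):

```c
#include <stdint.h>

/* Scan all k digits and keep x[n] via a mask, so the access pattern does
 * not depend on n.  Returns 0 when n >= k, as specified. */
static uint64_t bignum_digit_ref(uint64_t k, const uint64_t *x, uint64_t n) {
  uint64_t d = 0;
  for (uint64_t i = 0; i < k; i++) {
    uint64_t mask = (uint64_t)0 - (uint64_t)(i == n); /* all-ones iff i == n */
    d |= x[i] & mask;
  }
  return d;
}
```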
+// +// Standard ARM ABI: X0 = k, X1 = x, X2 = n, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_digit) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_digit) + .text + .balign 4 + +#define k x0 +#define x x1 +#define n x2 + +#define d x3 +#define i x4 +#define a x5 + + +S2N_BN_SYMBOL(bignum_digit): + +// For length zero finish immediately (the return value in x0 is 0) + + cbz k, bignum_digit_end + +// Set default of zero, run over all the digits and take note of the n'th one + + mov d, xzr + mov i, xzr +bignum_digit_loop: + ldr a, [x, i, lsl #3] + cmp i, n + csel d, a, d, eq + add i, i, #1 + cmp i, k + bcc bignum_digit_loop + +// Return + + mov x0, d + +bignum_digit_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_digitsize.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_digitsize.S new file mode 100644 index 00000000000..44d98c4e99b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_digitsize.S @@ -0,0 +1,52 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Return size of bignum in digits (64-bit word) +// Input x[k]; output function return +// +// extern uint64_t bignum_digitsize (uint64_t k, uint64_t *x); +// +// In the case of a zero bignum as input the result is 0 +// +// Standard ARM ABI: X0 = k, X1 = x, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_digitsize) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_digitsize) + .text + .balign 4 + +#define k x0 +#define x x1 +#define i x2 +#define a x3 +#define j x4 + + +S2N_BN_SYMBOL(bignum_digitsize): + +// If the bignum is zero-length, x0 is already the right answer of 0 + + cbz k, bignum_digitsize_end + +// Run over the words j = 0..i-1, and set i := j + 1 when hitting nonzero a[j] + + mov i, xzr + mov j, xzr +bignum_digitsize_loop: + ldr a, [x, j, lsl #3] + add j, j, #1 + cmp a, #0 + csel i, j, i, ne + cmp j, k + bne bignum_digitsize_loop + + mov x0, i +bignum_digitsize_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_divmod10.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_divmod10.S new file mode 100644 index 00000000000..2d3515217e1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_divmod10.S @@ -0,0 +1,79 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Divide bignum by 10, returning remainder: z' := z div 10, return = z mod 10 +// Inputs z[k]; outputs function return (remainder) and z[k] +// +// extern uint64_t bignum_divmod10 (uint64_t k, uint64_t *z); +// +// Standard ARM ABI: X0 = k, X1 = z, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_divmod10) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_divmod10) + .text + .balign 4 + +#define k x0 +#define z x1 + +#define d x2 + +#define h x3 +#define q x3 + +#define l x4 +#define r x4 + +#define w x5 +#define s x6 + +S2N_BN_SYMBOL(bignum_divmod10): + +// If k = 0 then return; the return in x0 is indeed 0 mod 10 = 0 + + cbz k, bignum_divmod10_end + +// Straightforward top-down loop doing 10 * q + r' := 2^64 * r + d + + mov r, xzr + mov w, 0x3333333333333333 + add s, w, 1 + and w, w, 0xfffffff +bignum_divmod10_divloop: + sub k, k, 1 + ldr d, [z, k, lsl #3] + +// First re-split and shift so 2^28 * h + l = (2^64 * r + d) / 2 +// Then (2^64 * r + d) / 10 = [(2^28 - 1) / 5] * h + (h + l) / 5 + + extr h, r, d, 29 + ubfx l, d, 1, 28 + + add l, h, l + + mul h, h, w + umulh l, l, s + add q, h, l + str q, [z, k, lsl #3] + +// Generate the new remainder r = d - 10 * q +// Since r <= 9 we only need the low part computation ignoring carries + + add q, q, q, lsl #2 + sub r, d, q, lsl #1 + + cbnz k, bignum_divmod10_divloop + +// Return the final remainder + + mov x0, r + +bignum_divmod10_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_emontredc.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_emontredc.S new file mode 100644 index 00000000000..ebd6a364549 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_emontredc.S @@ -0,0 +1,109 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended Montgomery reduce, returning results in input-output buffer +// Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] +// +// extern uint64_t bignum_emontredc +// (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w); +// +// Assumes that z initially holds a 2k-digit bignum z_0, m is a k-digit odd +// bignum and m * w == -1 (mod 2^64). This function also uses z for the output +// as well as returning a carry c of 0 or 1. This encodes two numbers: in the +// lower half of the z buffer we have q = z[0..k-1], while the upper half +// together with the carry gives r = 2^{64k}*c + z[k..2k-1]. These values +// satisfy z_0 + q * m = 2^{64k} * r, i.e. r gives a raw (unreduced) Montgomery +// reduction while q gives the multiplier that was used. Another way of +// thinking of it is that if z' is the output z with the lower half replaced +// with zeros, then z_0 + q * m = 2^{128k} * c + z'. 
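A reference model of that contract, written in straightforward C rather than the optimized form below, could look like this (assumes a compiler providing unsigned __int128; the name is illustrative):

```c
#include <stdint.h>

/* Leaves q in z[0..k-1] and the unreduced upper half in z[k..2k-1], and
 * returns the top carry c, so that z_0 + q*m = 2^{64k} * (2^{64k}*c + upper). */
static uint64_t bignum_emontredc_ref(uint64_t k, uint64_t *z,
                                     const uint64_t *m, uint64_t w) {
  uint64_t c = 0;
  for (uint64_t i = 0; i < k; i++) {
    uint64_t d = z[i] * w;               /* Montgomery multiplier for window i */
    unsigned __int128 carry = 0;
    for (uint64_t j = 0; j < k; j++) {   /* z += d * m at position i */
      carry += (unsigned __int128)d * m[j] + z[i + j];
      z[i + j] = (uint64_t)carry;        /* low word becomes 0 when j == 0 */
      carry >>= 64;
    }
    carry += (unsigned __int128)z[i + k] + c;
    z[i + k] = (uint64_t)carry;
    c = (uint64_t)(carry >> 64);
    z[i] = d;                            /* record the q digit in the low half */
  }
  return c;
}
```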
+// +// Standard ARM ABI: X0 = k, X1 = z, X2 = m, X3 = w, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_emontredc) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_emontredc) + .text + .balign 4 + +#define k x0 +#define z x1 +#define m x2 +#define w x3 + +// Outer loop counter +#define i x4 +// Inner loop counter +#define j x5 +// Home for Montgomery multiplier +#define d x6 + +// Top carry for current window +#define c x7 + +#define h x8 +#define e x9 +#define l x10 +#define a x11 + + +S2N_BN_SYMBOL(bignum_emontredc): + +// If k = 0 the whole operation is trivial; note we also get a return of c = 0 + + cbz k, bignum_emontredc_end + +// Initialize top carry to zero, and launch into the outer loop + + mov c, xzr + mov i, xzr +bignum_emontredc_outerloop: + + ldr e, [z] + mul d, e, w + ldr a, [m] + mul l, d, a + umulh h, d, a + str d, [z] + adds xzr, e, l // Will be zero but want the carry + mov j, #1 + sub a, k, #1 + cbz a, bignum_emontredc_montend +bignum_emontredc_montloop: + ldr a, [m, j, lsl #3] + ldr e, [z, j, lsl #3] + mul l, d, a + adcs e, e, h + umulh h, d, a + adc h, h, xzr + adds e, e, l + str e, [z, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_emontredc_montloop +bignum_emontredc_montend: + adcs h, h, c + adc c, xzr, xzr + ldr a, [z, k, lsl #3] + adds h, h, a + adc c, c, xzr + str h, [z, k, lsl #3] + +// End of outer loop + + add z, z, #8 // For simple indexing, z pointer moves + add i, i, #1 + cmp i, k + bcc bignum_emontredc_outerloop + +// Return c in X0 + + mov x0, c + +bignum_emontredc_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_eq.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_eq.S new file mode 100644 index 00000000000..91efacc1511 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_eq.S @@ -0,0 +1,82 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Test bignums for equality, x = y +// Inputs x[m], y[n]; output function return +// +// extern uint64_t bignum_eq +// (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Standard ARM ABI: X0 = m, X1 = x, X2 = n, X3 = y, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_eq) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_eq) + .text + .balign 4 + +#define m x0 +#define x x1 +#define n x2 +#define y x3 +#define a x4 +#define c x5 +// We can re-use n for this, not needed when d appears +#define d x2 + + +S2N_BN_SYMBOL(bignum_eq): + +// Initialize the accumulated OR of differences to zero + + mov c, xzr + +// If m >= n jump into the m > n loop at the final equality test +// This will drop through for m = n + + cmp m, n + bcs bignum_eq_mtest + +// Toploop for the case n > m + +bignum_eq_nloop: + sub n, n, #1 + ldr a, [y, n, lsl #3] + orr c, c, a + cmp m, n + bne bignum_eq_nloop + b bignum_eq_mmain + +// Toploop for the case m > n (or n = m which enters at "mtest") + +bignum_eq_mloop: + sub m, m, #1 + ldr a, [x, m, lsl #3] + orr c, c, a + cmp m, n +bignum_eq_mtest: + bne bignum_eq_mloop + +// Combined main loop for the min(m,n) lower words + +bignum_eq_mmain: + cbz m, bignum_eq_end + +bignum_eq_loop: + sub m, m, #1 + ldr a, [x, m, lsl #3] + ldr d, [y, m, lsl #3] + eor a, a, d + orr c, c, a + cbnz m, bignum_eq_loop + +bignum_eq_end: + cmp c, xzr + cset x0, eq + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_even.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_even.S new file mode 100644 index 00000000000..16700254ebd --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_even.S @@ -0,0 +1,31 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Test bignum for even-ness +// Input x[k]; output function return +// +// extern uint64_t bignum_even (uint64_t k, uint64_t *x); +// +// Standard ARM ABI: X0 = k, X1 = x, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_even) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_even) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_even): + +cbz x0, bignum_even_end // if k = 0, that's the return! + ldr x0, [x1] + and x0, x0, #1 + +bignum_even_end: + eor x0, x0, #1 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/arm/generic/bignum_ge.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_ge.S similarity index 100% rename from third_party/s2n-bignum/arm/generic/bignum_ge.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_ge.S diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_gt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_gt.S new file mode 100644 index 00000000000..343d53fb119 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_gt.S @@ -0,0 +1,89 @@ +// Copyright Amazon.com, Inc. 
or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Compare bignums, x > y +// Inputs x[m], y[n]; output function return +// +// extern uint64_t bignum_gt +// (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Standard ARM ABI: X0 = m, X1 = x, X2 = n, X3 = y, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_gt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_gt) + .text + .balign 4 + +#define m x0 +#define x x1 +#define n x2 +#define y x3 +#define i x4 +#define a x5 +#define d x6 + + +S2N_BN_SYMBOL(bignum_gt): + +// Zero the main index counter for both branches + + mov i, xzr + +// Speculatively form n := n - m and do case split + + subs n, n, m + bcc bignum_gt_ylonger + +// The case where y is longer or of the same size (n >= m) +// Note that CF=1 initially by the fact that we reach this point + + cbz m, bignum_gt_xtest +bignum_gt_xmainloop: + ldr a, [y, i, lsl #3] + ldr d, [x, i, lsl #3] + sbcs xzr, a, d + add i, i, #1 + sub m, m, #1 + cbnz m, bignum_gt_xmainloop +bignum_gt_xtest: + cbz n, bignum_gt_xskip +bignum_gt_xtoploop: + ldr a, [y, i, lsl #3] + sbcs xzr, a, xzr + add i, i, #1 + sub n, n, #1 + cbnz n, bignum_gt_xtoploop +bignum_gt_xskip: + cset x0, cc + ret + +// The case where x is longer (m > n) +// The first "adds" also makes sure CF=1 initially in this branch + +bignum_gt_ylonger: + adds n, n, m + cbz n, bignum_gt_ytoploop + sub m, m, n +bignum_gt_ymainloop: + ldr a, [y, i, lsl #3] + ldr d, [x, i, lsl #3] + sbcs xzr, a, d + add i, i, #1 + sub n, n, #1 + cbnz n, bignum_gt_ymainloop +bignum_gt_ytoploop: + ldr a, [x, i, lsl #3] + sbcs xzr, xzr, a + add i, i, #1 + sub m, m, #1 + cbnz m, bignum_gt_ytoploop + + cset x0, cc + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_iszero.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_iszero.S new file mode 100644 index 00000000000..c65c026de3b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_iszero.S @@ -0,0 +1,43 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
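The borrow-chain idiom used by bignum_gt above (and by the bignum_lt/bignum_le variants that follow) can be expressed as a simple C reference, shown here with explicit bounds checks in place of the assembly's case split (illustrative name):

```c
#include <stdint.h>

/* Run the full subtraction y - x, zero-extending the shorter operand,
 * purely for its final borrow; x > y exactly when it borrows. */
static uint64_t bignum_gt_ref(uint64_t m, const uint64_t *x,
                              uint64_t n, const uint64_t *y) {
  uint64_t len = (m > n) ? m : n;
  uint64_t borrow = 0;
  for (uint64_t i = 0; i < len; i++) {
    uint64_t a = (i < n) ? y[i] : 0;
    uint64_t b = (i < m) ? x[i] : 0;
    /* borrow out of a - b - borrow_in */
    borrow = (a < b) || (a == b && borrow);
  }
  return borrow;   /* 1 iff x > y */
}
```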
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Test bignum for zero-ness, x = 0 +// Input x[k]; output function return +// +// extern uint64_t bignum_iszero (uint64_t k, uint64_t *x); +// +// Standard ARM ABI: X0 = k, X1 = x, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_iszero) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_iszero) + .text + .balign 4 + +#define k x0 +#define x x1 +#define a x2 +#define c x3 + + +S2N_BN_SYMBOL(bignum_iszero): + +mov c, xzr // c will be or of the digits + cbz k, bignum_iszero_end // if k = 0 skip the bignum_iszero_loop + +bignum_iszero_loop: + sub k, k, #1 + ldr a, [x, k, lsl #3] + orr c, c, a + cbnz k, bignum_iszero_loop + +bignum_iszero_end: + cmp c, xzr + cset x0, eq + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_le.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_le.S new file mode 100644 index 00000000000..243f81c99f9 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_le.S @@ -0,0 +1,89 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Compare bignums, x <= y +// Inputs x[m], y[n]; output function return +// +// extern uint64_t bignum_le +// (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Standard ARM ABI: X0 = m, X1 = x, X2 = n, X3 = y, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_le) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_le) + .text + .balign 4 + +#define m x0 +#define x x1 +#define n x2 +#define y x3 +#define i x4 +#define a x5 +#define d x6 + + +S2N_BN_SYMBOL(bignum_le): + +// Zero the main index counter for both branches + + mov i, xzr + +// Speculatively form n := n - m and do case split + + subs n, n, m + bcc bignum_le_ylonger + +// The case where y is longer or of the same size (n >= m) +// Note that CF=1 initially by the fact that we reach this point + + cbz m, bignum_le_xtest +bignum_le_xmainloop: + ldr a, [y, i, lsl #3] + ldr d, [x, i, lsl #3] + sbcs xzr, a, d + add i, i, #1 + sub m, m, #1 + cbnz m, bignum_le_xmainloop +bignum_le_xtest: + cbz n, bignum_le_xskip +bignum_le_xtoploop: + ldr a, [y, i, lsl #3] + sbcs xzr, a, xzr + add i, i, #1 + sub n, n, #1 + cbnz n, bignum_le_xtoploop +bignum_le_xskip: + cset x0, cs + ret + +// The case where x is longer (m > n) +// The first "adds" also makes sure CF=1 initially in this branch + +bignum_le_ylonger: + adds n, n, m + cbz n, bignum_le_ytoploop + sub m, m, n +bignum_le_ymainloop: + ldr a, [y, i, lsl #3] + ldr d, [x, i, lsl #3] + sbcs xzr, a, d + add i, i, #1 + sub n, n, #1 + cbnz n, bignum_le_ymainloop +bignum_le_ytoploop: + ldr a, [x, i, lsl #3] + sbcs xzr, xzr, a + add i, i, #1 + sub m, m, #1 + cbnz m, bignum_le_ytoploop + + cset x0, cs + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_lt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_lt.S new file mode 100644 index 
00000000000..554bfec2aa6 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_lt.S @@ -0,0 +1,89 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Compare bignums, x < y +// Inputs x[m], y[n]; output function return +// +// extern uint64_t bignum_lt +// (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Standard ARM ABI: X0 = m, X1 = x, X2 = n, X3 = y, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_lt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_lt) + .text + .balign 4 + +#define m x0 +#define x x1 +#define n x2 +#define y x3 +#define i x4 +#define a x5 +#define d x6 + + +S2N_BN_SYMBOL(bignum_lt): + +// Zero the main index counter for both branches + + mov i, xzr + +// Speculatively form m := m - n and do case split + + subs m, m, n + bcc bignum_lt_ylonger + +// The case where x is longer or of the same size (m >= n) +// Note that CF=1 initially by the fact that we reach this point + + cbz n, bignum_lt_xtest +bignum_lt_xmainloop: + ldr a, [x, i, lsl #3] + ldr d, [y, i, lsl #3] + sbcs xzr, a, d + add i, i, #1 + sub n, n, #1 + cbnz n, bignum_lt_xmainloop +bignum_lt_xtest: + cbz m, bignum_lt_xskip +bignum_lt_xtoploop: + ldr a, [x, i, lsl #3] + sbcs xzr, a, xzr + add i, i, #1 + sub m, m, #1 + cbnz m, bignum_lt_xtoploop +bignum_lt_xskip: + cset x0, cc + ret + +// The case where y is longer (n > m) +// The first "adds" also makes sure CF=1 initially in this branch + +bignum_lt_ylonger: + adds m, m, n + cbz m, bignum_lt_ytoploop + sub n, n, m +bignum_lt_ymainloop: + ldr a, [x, i, lsl #3] + ldr d, [y, i, lsl #3] + sbcs xzr, a, d + add i, i, #1 + sub m, m, #1 + cbnz m, bignum_lt_ymainloop +bignum_lt_ytoploop: + ldr a, [y, i, lsl #3] + sbcs xzr, xzr, a + add i, i, #1 + sub n, n, #1 + cbnz n, bignum_lt_ytoploop + + cset x0, cc + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_madd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_madd.S new file mode 100644 index 00000000000..b956e7c7f6e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_madd.S @@ -0,0 +1,124 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply-add, z := z + x * y +// Inputs x[m], y[n]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_madd +// (uint64_t k, uint64_t *z, +// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Does the "z := x * y + z" operation, while also returning a "next" or +// "carry" word. In the case where m + n <= p (i.e. the pure product would +// fit in the destination) this is the remainder for the exact result. 
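A C reference for that contract, mirroring the column-by-column sliding accumulator of the routine below, might be sketched as follows (assumes unsigned __int128; the name is illustrative):

```c
#include <stdint.h>

/* Process columns 0..k-1 of z + x*y with a three-word sliding accumulator
 * and return the word that would land at position k. */
static uint64_t bignum_madd_ref(uint64_t k, uint64_t *z,
                                uint64_t m, const uint64_t *x,
                                uint64_t n, const uint64_t *y) {
  uint64_t lo = 0, hi = 0;                     /* carry limbs into column r */
  for (uint64_t r = 0; r < k; r++) {
    unsigned __int128 acc = ((unsigned __int128)hi << 64) | lo;
    uint64_t top = 0;
    unsigned __int128 s = acc + z[r];
    top += (s < acc);                          /* carry out of 128 bits */
    acc = s;
    /* Accumulate every product x[i]*y[r-i] that hits column r */
    uint64_t i0 = (r + 1 > n) ? r + 1 - n : 0;
    uint64_t i1 = (r + 1 < m) ? r + 1 : m;
    for (uint64_t i = i0; i < i1; i++) {
      unsigned __int128 p = (unsigned __int128)x[i] * y[r - i];
      s = acc + p;
      top += (s < acc);
      acc = s;
    }
    z[r] = (uint64_t)acc;                      /* result digit r */
    lo = (uint64_t)(acc >> 64);                /* slide the accumulator */
    hi = top;
  }
  return lo;                                   /* the "carry/remainder" word */
}
```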
+// +// Standard ARM ABI: X0 = k, X1 = z, X2 = m, X3 = x, X4 = n, X5 = y, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_madd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_madd) + .text + .balign 4 + +#define p x0 +#define z x1 +#define m x2 +#define x x3 +#define n x4 +#define y x5 +#define l x6 +#define h x7 +#define c x8 +#define k x9 +#define i x10 +#define a x11 +#define b x12 +#define d x13 +#define xx x14 +#define yy x15 + + +S2N_BN_SYMBOL(bignum_madd): + +// If p = 0 the result is trivial and nothing needs doing +// Note that fortuitously our "carry/remainder" term is still right! +// As it's a multiply-add, could also do this if either argument is trivial + + cbz p, bignum_madd_end + +// initialize (h,l) = 0, saving c = 0 for inside the loop + + mov l, xzr + mov h, xzr + +// Iterate outer loop from k = 0 ... k = p - 1 producing result digits + + mov k, xzr +bignum_madd_outerloop: + +// Add the existing z[k] and (h,l) to get initial (c,h,l) combination + + ldr c, [z, k, lsl #3] + adds l, l, c + adcs h, h, xzr + adc c, xzr, xzr + +// First let a = MAX 0 (k + 1 - n) and b = MIN (k + 1) m +// We want to accumulate all x[i] * y[k - i] for a <= i < b + + add a, k, #1 + cmp a, m + csel b, a, m, cc + subs a, a, n + csel a, a, xzr, cs + +// Set loop count i = b - a, and skip everything if it's <= 0 + + subs i, b, a + bls bignum_madd_innerend + +// Use temporary pointers xx = x + 8 * a and yy = y + 8 * (k - b) +// Increment xx per iteration but just use loop counter with yy +// So we start with [xx] = x[a] and [yy] = y[(k - b) + (b - a)] = y[k - a] + + lsl xx, a, #3 + add xx, xx, x + + sub yy, k, b + lsl yy, yy, #3 + add yy, yy, y + +// And index using the loop counter i = b - a, ..., i = 1 + +bignum_madd_innerloop: + ldr a, [xx], #8 + ldr b, [yy, i, lsl #3] + mul d, a, b + umulh a, a, b + adds l, l, d + adcs h, h, a + adc c, c, xzr + subs i, i, #1 + bne bignum_madd_innerloop + +bignum_madd_innerend: + str l, [z, k, lsl #3] + mov l, h + mov h, c + + add k, k, #1 + cmp k, p + bcc bignum_madd_outerloop // Inverted carry flag! + +// Return the "carry/remainder" term + + mov x0, l + +bignum_madd_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modadd.S new file mode 100644 index 00000000000..d23d1e101ed --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modadd.S @@ -0,0 +1,83 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Add modulo m, z := (x + y) mod m, assuming x and y reduced +// Inputs x[k], y[k], m[k]; output z[k] +// +// extern void bignum_modadd +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = x, X3 = y, X4 = m +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modadd) + .text + .balign 4 + +#define k x0 +#define z x1 +#define x x2 +#define y x3 +#define m x4 +#define i x5 +#define j x6 +#define a x7 +#define b x8 +#define c x9 + + +S2N_BN_SYMBOL(bignum_modadd): + +adds j, k, xzr // j = k and ZF = (k = 0) + beq bignum_modadd_end // if k = 0 do nothing + adds i, xzr, xzr // i = 0 and CF = 0 + +// First just add (c::z) := x + y + +bignum_modadd_addloop: + ldr a, [x, i] + ldr b, [y, i] + adcs a, a, b + str a, [z, i] + add i, i, #8 + sub j, j, #1 + cbnz j, bignum_modadd_addloop + cset c, cs + +// Now do a comparison subtraction (c::z) - m, recording mask for (c::z) >= m + + mov j, k + subs i, xzr, xzr +bignum_modadd_cmploop: + ldr a, [z, i] + ldr b, [m, i] + sbcs xzr, a, b + add i, i, #8 + sub j, j, #1 + cbnz j, bignum_modadd_cmploop + sbcs c, c, xzr + mvn c, c + +// Now do a masked subtraction z := z - [c] * m + + mov j, k + subs i, xzr, xzr +bignum_modadd_subloop: + ldr a, [z, i] + ldr b, [m, i] + and b, b, c + sbcs a, a, b + str a, [z, i] + add i, i, #8 + sub j, j, #1 + cbnz j, bignum_modadd_subloop + +bignum_modadd_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_moddouble.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_moddouble.S new file mode 100644 index 00000000000..286e5ea43eb --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_moddouble.S @@ -0,0 +1,72 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
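The add/compare/correct pattern of bignum_modadd above has a compact C reference, assuming reduced inputs and a compiler with unsigned __int128 (illustrative name):

```c
#include <stdint.h>

/* z = x + y, then subtract m exactly when the (k+1)-digit sum is >= m,
 * using a mask rather than a data-dependent branch.  Assumes x, y < m. */
static void bignum_modadd_ref(uint64_t k, uint64_t *z, const uint64_t *x,
                              const uint64_t *y, const uint64_t *m) {
  unsigned __int128 acc = 0;
  for (uint64_t i = 0; i < k; i++) {           /* (c::z) := x + y */
    acc += (unsigned __int128)x[i] + y[i];
    z[i] = (uint64_t)acc;
    acc >>= 64;
  }
  uint64_t c = (uint64_t)acc;                  /* top carry bit */

  uint64_t borrow = 0;                         /* does z - m borrow? */
  for (uint64_t i = 0; i < k; i++) {
    unsigned __int128 d = (unsigned __int128)z[i] - m[i] - borrow;
    borrow = (uint64_t)(d >> 64) & 1;
  }

  uint64_t mask = (uint64_t)0 - (c | (1 - borrow)); /* all-ones iff (c::z) >= m */
  borrow = 0;
  for (uint64_t i = 0; i < k; i++) {           /* masked subtraction of m */
    unsigned __int128 d = (unsigned __int128)z[i] - (m[i] & mask) - borrow;
    z[i] = (uint64_t)d;
    borrow = (uint64_t)(d >> 64) & 1;
  }
}
```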
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Double modulo m, z := (2 * x) mod m, assuming x reduced +// Inputs x[k], m[k]; output z[k] +// +// extern void bignum_moddouble +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = x, X3 = m +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_moddouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_moddouble) + .text + .balign 4 + +#define k x0 +#define z x1 +#define x x2 +#define m x3 +#define i x4 +#define j x5 +#define a x6 +#define b x7 +#define c x8 + + +S2N_BN_SYMBOL(bignum_moddouble): + +adds j, k, xzr // j = k and ZF = (k = 0) + beq bignum_moddouble_end // if k = 0 do nothing + +// Do (_::z) = 2 * x - m and generate a mask in c for 2 * x < m + + mov c, xzr + subs i, xzr, xzr // i = 0 and CF = 1 +bignum_moddouble_dubloop: + ldr a, [x, i] + extr c, a, c, #63 + ldr b, [m, i] + sbcs c, c, b + str c, [z, i] + mov c, a + add i, i, #8 + sub j, j, #1 + cbnz j, bignum_moddouble_dubloop + lsr c, c, #63 + sbc c, c, xzr + +// Now do a corrective masked addition z := z + [c] * m + + mov j, k + adds i, xzr, xzr +bignum_moddouble_corrloop: + ldr a, [z, i] + ldr b, [m, i] + and b, b, c + adcs a, a, b + str a, [z, i] + add i, i, #8 + sub j, j, #1 + cbnz j, bignum_moddouble_corrloop + +bignum_moddouble_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modexp.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modexp.S new file mode 100644 index 00000000000..761b6e64f30 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modexp.S @@ -0,0 +1,540 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Modular exponentiation for arbitrary odd modulus +// Inputs a[k], p[k], m[k]; output z[k], temporary buffer t[>=3*k] +// +// extern void bignum_modexp +// (uint64_t k,uint64_t *z, uint64_t *a,uint64_t *p,uint64_t *m,uint64_t *t); +// +// Does z := (a^p) mod m where all numbers are k-digit and m is odd +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = a, X3 = p, X4 = m, X5 = t +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modexp) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modexp) + .text + .balign 4 + +// Local variables, all held in extra registers + +#define k x19 +#define res x20 +#define a x21 +#define p x22 +#define m x23 +#define x x24 +#define i x25 +#define y x, k, lsl #3 +#define z x, k, lsl #4 + +S2N_BN_SYMBOL(bignum_modexp): + +// Save some registers including link register + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x30, [sp, #-16]! 
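Similarly, the double/correct pattern of bignum_moddouble above can be sketched in C as follows, assuming x is reduced and unsigned __int128 is available (illustrative name):

```c
#include <stdint.h>

/* Form the (k+1)-digit value 2*x - m; if it went negative, add m back
 * under a mask.  Assumes x < m. */
static void bignum_moddouble_ref(uint64_t k, uint64_t *z,
                                 const uint64_t *x, const uint64_t *m) {
  uint64_t topbit = 0, borrow = 0;
  for (uint64_t i = 0; i < k; i++) {
    uint64_t d = (x[i] << 1) | topbit;         /* next digit of 2*x */
    topbit = x[i] >> 63;
    unsigned __int128 s = (unsigned __int128)d - m[i] - borrow;
    z[i] = (uint64_t)s;
    borrow = (uint64_t)(s >> 64) & 1;
  }
  /* 2*x - m is negative exactly when the low-k subtraction borrowed and
   * the shifted-out top bit was zero (given x < m). */
  uint64_t mask = (uint64_t)0 - (borrow & (topbit ^ 1));
  unsigned __int128 carry = 0;
  for (uint64_t i = 0; i < k; i++) {           /* corrective masked addition */
    carry += (unsigned __int128)z[i] + (m[i] & mask);
    z[i] = (uint64_t)carry;
    carry >>= 64;
  }
}
```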
+ +// If size is zero (which falsifies the oddness condition) do nothing + + cbz x0, bignum_modexp_end + +// Move parameters into permanent homes + + mov k, x0 + mov res, x1 + mov a, x2 + mov p, x3 + mov m, x4 + mov x, x5 + +// Let x == 2^64k * a (mod m) and initialize z == 2^64k * 1 (mod m) + + mov x0, k + add x1, z + mov x2, m + add x3, y + bl bignum_modexp_local_amontifier + + mov x0, k + mov x1, x + add x2, z + mov x3, a + mov x4, m + bl bignum_modexp_local_amontmul + + mov x0, k + add x1, z + add x2, z + mov x3, m + bl bignum_modexp_local_demont + +// Main loop with z == 2^64k * a^(p >> 2^i) (mod m) + + lsl i, k, #6 + +bignum_modexp_loop: + sub i, i, #1 + + mov x0, k + add x1, y + add x2, z + add x3, z + mov x4, m + bl bignum_modexp_local_amontmul + + mov x0, k + add x1, z + mov x2, x + add x3, y + mov x4, m + bl bignum_modexp_local_amontmul + + lsr x0, i, #6 + ldr x0, [p, x0, lsl #3] + lsr x0, x0, i + and x0, x0, #1 + + mov x1, k + add x2, z + add x3, z + add x4, y + bl bignum_modexp_local_mux + + cbnz i, bignum_modexp_loop + +// Convert back from Montgomery representation and copy the result +// (via a degenerate case of multiplexing) into the output buffer + + mov x0, k + add x1, z + add x2, z + mov x3, m + bl bignum_modexp_local_demont + + mov x0, xzr + mov x1, k + mov x2, res + add x3, z + add x4, z + bl bignum_modexp_local_mux + +// Restore registers and return + +bignum_modexp_end: + + ldp x25, x30, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +// Local copy of bignum_amontifier + +bignum_modexp_local_amontifier: + cbz x0, bignum_modexp_amontifend + mov x4, xzr +bignum_modexp_copyinloop: + ldr x9, [x2, x4, lsl #3] + str x9, [x3, x4, lsl #3] + add x4, x4, #0x1 + cmp x4, x0 + b.cc bignum_modexp_copyinloop + subs x4, x0, #0x1 + b.eq bignum_modexp_normalized +bignum_modexp_normloop: + mov x5, xzr + cmp x9, xzr + mov x7, xzr +bignum_modexp_shufloop: + mov x9, x7 + ldr x7, [x3, x5, lsl #3] + csel x9, x9, x7, eq + str x9, [x3, x5, lsl #3] + add x5, x5, #0x1 + sub x11, x5, x0 + cbnz x11, bignum_modexp_shufloop + subs x4, x4, #0x1 + b.ne bignum_modexp_normloop +bignum_modexp_normalized: + clz x9, x9 + mov x10, xzr + mov x4, xzr + tst x9, #0x3f + csetm x8, ne + neg x11, x9 +bignum_modexp_bitloop: + ldr x5, [x3, x4, lsl #3] + lsl x7, x5, x9 + orr x7, x7, x10 + lsr x10, x5, x11 + and x10, x10, x8 + str x7, [x3, x4, lsl #3] + add x4, x4, #0x1 + cmp x4, x0 + b.cc bignum_modexp_bitloop + sub x6, x0, #0x1 + ldr x6, [x3, x6, lsl #3] + mov x11, #0x1 + neg x10, x6 + mov x4, #0x3e +bignum_modexp_estloop: + add x11, x11, x11 + mov x7, x6 + sub x7, x7, x10 + cmp x10, x7 + csetm x7, cs + sub x11, x11, x7 + add x10, x10, x10 + and x7, x7, x6 + sub x10, x10, x7 + subs x4, x4, #0x1 + b.ne bignum_modexp_estloop + cmp x10, x6 + cinc x11, x11, eq + mov x9, xzr + adds x4, xzr, xzr +bignum_modexp_mulloop: + ldr x7, [x3, x4, lsl #3] + mul x8, x11, x7 + adcs x8, x8, x9 + umulh x9, x11, x7 + str x8, [x1, x4, lsl #3] + add x4, x4, #0x1 + sub x7, x4, x0 + cbnz x7, bignum_modexp_mulloop + adc x9, x9, xzr + mov x7, #0x4000000000000000 + subs x9, x9, x7 + csetm x11, cs + negs x4, xzr +bignum_modexp_remloop: + ldr x7, [x3, x4, lsl #3] + ldr x10, [x1, x4, lsl #3] + and x7, x7, x11 + sbcs x7, x7, x10 + str x7, [x1, x4, lsl #3] + add x4, x4, #0x1 + sub x7, x4, x0 + cbnz x7, bignum_modexp_remloop + mov x9, xzr + negs x5, xzr +bignum_modexp_dubloop1: + ldr x7, [x1, x5, lsl #3] + extr x9, x7, x9, #63 + ldr x10, [x3, x5, lsl #3] + sbcs x9, x9, x10 + str x9, [x1, x5, lsl #3] + 
mov x9, x7 + add x5, x5, #0x1 + sub x7, x5, x0 + cbnz x7, bignum_modexp_dubloop1 + lsr x9, x9, #63 + sbc x9, x9, xzr + adds x5, xzr, xzr +bignum_modexp_corrloop1: + ldr x7, [x1, x5, lsl #3] + ldr x10, [x3, x5, lsl #3] + and x10, x10, x9 + adcs x7, x7, x10 + str x7, [x1, x5, lsl #3] + add x5, x5, #0x1 + sub x7, x5, x0 + cbnz x7, bignum_modexp_corrloop1 + mov x9, xzr + negs x5, xzr +bignum_modexp_dubloop2: + ldr x7, [x1, x5, lsl #3] + extr x9, x7, x9, #63 + ldr x10, [x3, x5, lsl #3] + sbcs x9, x9, x10 + str x9, [x1, x5, lsl #3] + mov x9, x7 + add x5, x5, #0x1 + sub x7, x5, x0 + cbnz x7, bignum_modexp_dubloop2 + lsr x9, x9, #63 + sbc x9, x9, xzr + adds x5, xzr, xzr +bignum_modexp_corrloop2: + ldr x7, [x1, x5, lsl #3] + ldr x10, [x3, x5, lsl #3] + and x10, x10, x9 + adcs x7, x7, x10 + str x7, [x1, x5, lsl #3] + str x7, [x3, x5, lsl #3] + add x5, x5, #0x1 + sub x7, x5, x0 + cbnz x7, bignum_modexp_corrloop2 + mov x6, xzr + mov x4, x0 +bignum_modexp_modloop: + mov x5, xzr + mov x10, xzr + adds x9, xzr, xzr +bignum_modexp_cmaloop: + ldr x7, [x1, x5, lsl #3] + mul x8, x6, x7 + adcs x10, x10, x9 + umulh x9, x6, x7 + adc x9, x9, xzr + adds x8, x10, x8 + ldr x10, [x3, x5, lsl #3] + str x8, [x3, x5, lsl #3] + add x5, x5, #0x1 + sub x7, x5, x0 + cbnz x7, bignum_modexp_cmaloop + adcs x6, x10, x9 + csetm x8, cs + adds x5, xzr, xzr +bignum_modexp_oaloop: + ldr x7, [x3, x5, lsl #3] + ldr x10, [x1, x5, lsl #3] + and x10, x10, x8 + adcs x7, x7, x10 + str x7, [x3, x5, lsl #3] + add x5, x5, #0x1 + sub x7, x5, x0 + cbnz x7, bignum_modexp_oaloop + adc x6, x6, xzr + subs x4, x4, #0x1 + b.ne bignum_modexp_modloop + ldr x7, [x2] + lsl x11, x7, #2 + sub x11, x7, x11 + eor x11, x11, #0x2 + mov x8, #0x1 + madd x9, x7, x11, x8 + mul x10, x9, x9 + madd x11, x9, x11, x11 + mul x9, x10, x10 + madd x11, x10, x11, x11 + mul x10, x9, x9 + madd x11, x9, x11, x11 + madd x11, x10, x11, x11 + ldr x10, [x3] + mul x11, x10, x11 + mul x8, x11, x7 + umulh x9, x11, x7 + mov x5, #0x1 + sub x7, x0, #0x1 + cmn x10, x8 + cbz x7, bignum_modexp_montifend +bignum_modexp_montifloop: + ldr x7, [x2, x5, lsl #3] + ldr x10, [x3, x5, lsl #3] + mul x8, x11, x7 + adcs x10, x10, x9 + umulh x9, x11, x7 + adc x9, x9, xzr + adds x10, x10, x8 + sub x7, x5, #0x1 + str x10, [x3, x7, lsl #3] + add x5, x5, #0x1 + sub x7, x5, x0 + cbnz x7, bignum_modexp_montifloop +bignum_modexp_montifend: + adcs x6, x6, x9 + csetm x8, cs + sub x7, x0, #0x1 + str x6, [x3, x7, lsl #3] + negs x5, xzr +bignum_modexp_osloop: + ldr x7, [x3, x5, lsl #3] + ldr x10, [x2, x5, lsl #3] + and x10, x10, x8 + sbcs x7, x7, x10 + str x7, [x1, x5, lsl #3] + add x5, x5, #0x1 + sub x7, x5, x0 + cbnz x7, bignum_modexp_osloop +bignum_modexp_amontifend: + ret + +// Local copy of bignum_amontmul + +bignum_modexp_local_amontmul: + cbz x0, bignum_modexp_amomend + ldr x14, [x4] + lsl x5, x14, #2 + sub x5, x14, x5 + eor x5, x5, #0x2 + mov x6, #0x1 + madd x6, x14, x5, x6 + mul x7, x6, x6 + madd x5, x6, x5, x5 + mul x6, x7, x7 + madd x5, x7, x5, x5 + mul x7, x6, x6 + madd x5, x6, x5, x5 + madd x5, x7, x5, x5 + mov x8, xzr +bignum_modexp_zoop: + str xzr, [x1, x8, lsl #3] + add x8, x8, #0x1 + cmp x8, x0 + b.cc bignum_modexp_zoop + mov x6, xzr + mov x8, xzr +bignum_modexp_outerloop: + ldr x9, [x2, x8, lsl #3] + mov x10, xzr + adds x11, xzr, xzr +bignum_modexp_maddloop: + ldr x14, [x3, x10, lsl #3] + ldr x12, [x1, x10, lsl #3] + mul x13, x9, x14 + adcs x12, x12, x11 + umulh x11, x9, x14 + adc x11, x11, xzr + adds x12, x12, x13 + str x12, [x1, x10, lsl #3] + add x10, x10, #0x1 + sub x14, x10, x0 + cbnz x14, 
bignum_modexp_maddloop + adcs x6, x6, x11 + adc x7, xzr, xzr + ldr x12, [x1] + mul x9, x12, x5 + ldr x14, [x4] + mul x13, x9, x14 + umulh x11, x9, x14 + adds x12, x12, x13 + mov x10, #0x1 + sub x14, x0, #0x1 + cbz x14, bignum_modexp_montend +bignum_modexp_montloop: + ldr x14, [x4, x10, lsl #3] + ldr x12, [x1, x10, lsl #3] + mul x13, x9, x14 + adcs x12, x12, x11 + umulh x11, x9, x14 + adc x11, x11, xzr + adds x12, x12, x13 + sub x13, x10, #0x1 + str x12, [x1, x13, lsl #3] + add x10, x10, #0x1 + sub x14, x10, x0 + cbnz x14, bignum_modexp_montloop +bignum_modexp_montend: + adcs x11, x6, x11 + adc x6, x7, xzr + sub x13, x10, #0x1 + str x11, [x1, x13, lsl #3] + add x8, x8, #0x1 + cmp x8, x0 + b.cc bignum_modexp_outerloop + neg x6, x6 + negs x10, xzr +bignum_modexp_corrloop3: + ldr x14, [x1, x10, lsl #3] + ldr x12, [x4, x10, lsl #3] + and x12, x12, x6 + sbcs x14, x14, x12 + str x14, [x1, x10, lsl #3] + add x10, x10, #0x1 + sub x14, x10, x0 + cbnz x14, bignum_modexp_corrloop3 +bignum_modexp_amomend: + ret + +// Local copy of bignum_demont + +bignum_modexp_local_demont: + cbz x0, bignum_modexp_demontend + ldr x11, [x3] + lsl x4, x11, #2 + sub x4, x11, x4 + eor x4, x4, #0x2 + mov x5, #0x1 + madd x5, x11, x4, x5 + mul x6, x5, x5 + madd x4, x5, x4, x4 + mul x5, x6, x6 + madd x4, x6, x4, x4 + mul x6, x5, x5 + madd x4, x5, x4, x4 + madd x4, x6, x4, x4 + mov x5, xzr +bignum_modexp_iloop: + ldr x11, [x2, x5, lsl #3] + str x11, [x1, x5, lsl #3] + add x5, x5, #0x1 + cmp x5, x0 + b.cc bignum_modexp_iloop + mov x5, xzr +bignum_modexp_douterloop: + ldr x9, [x1] + mul x7, x9, x4 + ldr x11, [x3] + mul x10, x7, x11 + umulh x8, x7, x11 + adds x9, x9, x10 + mov x6, #0x1 + sub x11, x0, #0x1 + cbz x11, bignum_modexp_dmontend +bignum_modexp_dmontloop: + ldr x11, [x3, x6, lsl #3] + ldr x9, [x1, x6, lsl #3] + mul x10, x7, x11 + adcs x9, x9, x8 + umulh x8, x7, x11 + adc x8, x8, xzr + adds x9, x9, x10 + sub x10, x6, #0x1 + str x9, [x1, x10, lsl #3] + add x6, x6, #0x1 + sub x11, x6, x0 + cbnz x11, bignum_modexp_dmontloop +bignum_modexp_dmontend: + adc x8, xzr, x8 + sub x10, x6, #0x1 + str x8, [x1, x10, lsl #3] + add x5, x5, #0x1 + cmp x5, x0 + b.cc bignum_modexp_douterloop + negs x6, xzr +bignum_modexp_cmploop: + ldr x11, [x1, x6, lsl #3] + ldr x9, [x3, x6, lsl #3] + sbcs xzr, x11, x9 + add x6, x6, #0x1 + sub x11, x6, x0 + cbnz x11, bignum_modexp_cmploop + csetm x8, cs + negs x6, xzr +bignum_modexp_corrloop: + ldr x11, [x1, x6, lsl #3] + ldr x9, [x3, x6, lsl #3] + and x9, x9, x8 + sbcs x11, x11, x9 + str x11, [x1, x6, lsl #3] + add x6, x6, #0x1 + sub x11, x6, x0 + cbnz x11, bignum_modexp_corrloop +bignum_modexp_demontend: + ret + +// Local copy of bignum_mux + +bignum_modexp_local_mux: + cbz x1, bignum_modexp_muxend + cmp x0, #0x0 +bignum_modexp_muxloop: + sub x1, x1, #0x1 + ldr x5, [x3, x1, lsl #3] + ldr x0, [x4, x1, lsl #3] + csel x5, x5, x0, ne + str x5, [x2, x1, lsl #3] + cbnz x1, bignum_modexp_muxloop +bignum_modexp_muxend: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modifier.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modifier.S new file mode 100644 index 00000000000..312293274f8 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modifier.S @@ -0,0 +1,458 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Compute "modification" constant z := 2^{64k} mod m +// Input m[k]; output z[k]; temporary buffer t[>=k] +// +// extern void bignum_modifier +// (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t); +// +// The last argument points to a temporary buffer t that should have size >= k. +// This is called "mod-ifier" because given any other k-digit number x we can +// get x MOD m simply and reasonably efficiently just by Montgomery +// multiplication of x and z. But one can also consider it the identity for +// Montgomery multiplication, assuming you have a reduced multiplier already. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = m, X3 = t +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modifier) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modifier) + .text + .balign 4 + +#define k x0 +#define z x1 +#define m x2 +#define t x3 + +// Some variables +// Modular inverse w is aliased to i, but we never use them together + +#define i x4 +#define w x4 +#define j x5 +#define h x6 +#define a x7 +#define l x8 +#define c x9 +#define b x10 +#define d x11 + +// Some aliases for the values b and d + +#define r x10 +#define q x11 + + +S2N_BN_SYMBOL(bignum_modifier): + +// If k = 0 the whole operation is trivial + + cbz k, bignum_modifier_end + +// Copy the input m into the temporary buffer t. The temporary register +// c matters since we want it to hold the highest digit, ready for the +// normalization phase. + + mov i, xzr +bignum_modifier_copyinloop: + ldr c, [m, i, lsl #3] + str c, [t, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_modifier_copyinloop + +// Do a rather stupid but constant-time digit normalization, conditionally +// shifting left (k-1) times based on whether the top word is zero. +// With careful binary striding this could be O(k*log(k)) instead of O(k^2) +// while still retaining the constant-time style. +// The "cmp c, xzr" sets the zeroness predicate (ZF) for the entire inner loop + + subs i, k, #1 + beq bignum_modifier_normalized +bignum_modifier_normloop: + mov j, xzr + cmp c, xzr + mov a, xzr +bignum_modifier_shufloop: + mov c, a + ldr a, [t, j, lsl #3] + csel c, c, a, eq + str c, [t, j, lsl #3] + add j, j, #1 + sub d, j, k + cbnz d, bignum_modifier_shufloop + subs i, i, #1 + bne bignum_modifier_normloop + +// We now have the top digit nonzero, assuming the input was nonzero, +// and as per the invariant of the loop above, c holds that digit. So +// now just count c's leading zeros and shift t bitwise that many bits. + +bignum_modifier_normalized: + clz c, c + + mov b, xzr + mov i, xzr + ands xzr, c, #63 + csetm l, ne + neg d, c +bignum_modifier_bitloop: + ldr j, [t, i, lsl #3] + lsl a, j, c + orr a, a, b + lsr b, j, d + and b, b, l + str a, [t, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_modifier_bitloop + +// Let h be the high word of n, which in all the in-scope cases is >= 2^63. +// Now successively form q = 2^i div h and r = 2^i mod h as i goes from +// 64 to 126. We avoid just using division out of constant-time concerns +// (at the least we would need to fix up h = 0 for out-of-scope inputs) and +// don't bother with Newton-Raphson, since this stupid simple loop doesn't +// contribute much of the overall runtime at typical sizes. 
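For reference, here is a small C model of what this estimation loop computes (illustration only, not part of the patch; the helper name is invented). Starting from q = 2^64 div h and r = 2^64 mod h, which are exact because h >= 2^63, each pass doubles the power of two, so 62 passes give q = 2^126 div h and r = 2^126 mod h without any branch or division instruction. The r = h = 2^63 fix-up discussed just below is omitted from the sketch.

#include <stdint.h>

// Maintains q = 2^(64+i) div h and r = 2^(64+i) mod h for i = 0..62,
// assuming 2^63 <= h < 2^64 (guaranteed by the normalization above).
static void estimate_recip_model(uint64_t h, uint64_t *q_out, uint64_t *r_out) {
  uint64_t q = 1;                  // 2^64 div h
  uint64_t r = (uint64_t)0 - h;    // 2^64 mod h
  for (int i = 0; i < 62; i++) {
    uint64_t ge = (r >= h - r) ? ~(uint64_t)0 : 0;  // mask for 2*r >= h
    q += q;
    q -= ge;                       // q = 2*q + 1 when 2*r >= h
    r += r;
    r -= ge & h;                   // r = 2*r - h in that case
  }
  *q_out = q;
  *r_out = r;
}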
+ + sub h, k, #1 + ldr h, [t, h, lsl #3] + mov q, #1 + neg r, h + mov i, #62 +bignum_modifier_estloop: + add q, q, q + mov a, h + sub a, a, r + cmp r, a // CF <=> r >= h - r <=> 2 * r >= h + csetm a, cs + sub q, q, a + add r, r, r + and a, a, h + sub r, r, a + subs i, i, #1 + bne bignum_modifier_estloop + +// Strictly speaking the above loop doesn't quite give the true remainder +// and quotient in the special case r = h = 2^63, so fix it up. We get +// q = 2^63 - 1 and r = 2^63 and really want q = 2^63 and r = 0. This is +// supererogatory, because the main property of q used below still holds +// in this case unless the initial m = 1, and then anyway the overall +// specification (congruence modulo m) holds degenerately. But it seems +// nicer to get a "true" quotient and remainder. + + cmp r, h + csinc q, q, q, ne + +// So now we have q and r with 2^126 = q * h + r (imagining r = 0 in the +// fixed-up case above: note that we never actually use the computed +// value of r below and so didn't adjust it). And we can assume the ranges +// q <= 2^63 and r < h < 2^64. +// +// The idea is to use q as a first quotient estimate for a remainder +// of 2^{p+62} mod n, where p = 64 * k. We have, splitting n into the +// high and low parts h and l: +// +// 2^{p+62} - q * n = 2^{p+62} - q * (2^{p-64} * h + l) +// = 2^{p+62} - (2^{p-64} * (q * h) + q * l) +// = 2^{p+62} - 2^{p-64} * (2^126 - r) - q * l +// = 2^{p-64} * r - q * l +// +// Note that 2^{p-64} * r < 2^{p-64} * h <= n +// and also q * l < 2^63 * 2^{p-64} = 2^{p-1} <= n +// so |diff| = |2^{p-64} * r - q * l| < n. +// +// If in fact diff >= 0 then it is already 2^{p+62} mod n. +// otherwise diff + n is the right answer. +// +// To (maybe?) make the computation slightly easier we actually flip +// the sign and compute d = q * n - 2^{p+62}. Then the answer is either +// -d (when negative) or n - d; in either case we effectively negate d. +// This negating tweak in fact spoils the result for cases where +// 2^{p+62} mod n = 0, when we get n instead. However the only case +// where this can happen is m = 1, when the whole spec holds trivially, +// and actually the remainder of the logic below works anyway since +// the latter part of the code only needs a congruence for the k-digit +// result, not strict modular reduction (the doublings will maintain +// the non-strict inequality). + + mov c, xzr + adds i, xzr, xzr +bignum_modifier_mulloop: + ldr a, [t, i, lsl #3] + mul l, q, a + adcs l, l, c + umulh c, q, a + str l, [z, i, lsl #3] + add i, i, #1 + sub a, i, k + cbnz a, bignum_modifier_mulloop + + adc c, c, xzr + mov a, #0x4000000000000000 + subs c, c, a + csetm q, cs + +// Now do [c] * n - d for our final answer + + subs i, xzr, xzr +bignum_modifier_remloop: + ldr a, [t, i, lsl #3] + ldr b, [z, i, lsl #3] + and a, a, q + sbcs a, a, b + str a, [z, i, lsl #3] + add i, i, #1 + sub a, i, k + cbnz a, bignum_modifier_remloop + +// Now still need to do a couple of modular doublings to get us all the +// way up to 2^{p+64} == r from the initial 2^{p+62} == r (mod n). 
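A plain C sketch of one of these modular doublings on k digits (illustration only, not part of the patch; the helper name is invented). It tracks the borrow explicitly instead of using the flag-register idiom of the dubloop/corrloop pairs below: form 2*z - n digit by digit, then add n back under a mask if the result went negative.

#include <stdint.h>

// z := (2 * z) mod n over k 64-bit digits, assuming z is already reduced
// enough that a single conditional add-back suffices (as argued above).
static void moddouble_model(uint64_t k, uint64_t *z, const uint64_t *n) {
  uint64_t topbit = 0, borrow = 0;
  for (uint64_t j = 0; j < k; j++) {
    uint64_t dbl = (z[j] << 1) | topbit;             // doubled digit
    topbit = z[j] >> 63;                             // bit carried upward
    unsigned __int128 d = (unsigned __int128)dbl - n[j] - borrow;
    z[j] = (uint64_t)d;
    borrow = (uint64_t)(d >> 64) & 1;                // 1 if this digit borrowed
  }
  // 2*z - n is negative exactly when the final borrow exceeds the bit
  // shifted out of the top digit; in that case add n back once.
  uint64_t mask = (uint64_t)0 - (uint64_t)(borrow > topbit);
  uint64_t carry = 0;
  for (uint64_t j = 0; j < k; j++) {
    unsigned __int128 s = (unsigned __int128)z[j] + (n[j] & mask) + carry;
    z[j] = (uint64_t)s;
    carry = (uint64_t)(s >> 64);
  }
}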
+ + mov c, xzr + subs j, xzr, xzr +bignum_modifier_dubloop1: + ldr a, [z, j, lsl #3] + extr c, a, c, #63 + ldr b, [t, j, lsl #3] + sbcs c, c, b + str c, [z, j, lsl #3] + mov c, a + add j, j, #1 + sub a, j, k + cbnz a, bignum_modifier_dubloop1 + lsr c, c, #63 + sbc c, c, xzr + adds j, xzr, xzr +bignum_modifier_corrloop1: + ldr a, [z, j, lsl #3] + ldr b, [t, j, lsl #3] + and b, b, c + adcs a, a, b + str a, [z, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_modifier_corrloop1 + +// This is not exactly the same: we also copy output to t giving the +// initialization t_1 = r == 2^{p+64} mod n for the main loop next. + + mov c, xzr + subs j, xzr, xzr +bignum_modifier_dubloop2: + ldr a, [z, j, lsl #3] + extr c, a, c, #63 + ldr b, [t, j, lsl #3] + sbcs c, c, b + str c, [z, j, lsl #3] + mov c, a + add j, j, #1 + sub a, j, k + cbnz a, bignum_modifier_dubloop2 + lsr c, c, #63 + sbc c, c, xzr + adds j, xzr, xzr +bignum_modifier_corrloop2: + ldr a, [z, j, lsl #3] + ldr b, [t, j, lsl #3] + and b, b, c + adcs a, a, b + str a, [z, j, lsl #3] + str a, [t, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_modifier_corrloop2 + +// We then successively generate (k+1)-digit values satisfying +// t_i == 2^{p+64*i} mod n, each of which is stored in h::t. Finish +// initialization by zeroing h initially + + mov h, xzr + +// Then if t_i = 2^{p} * h + l +// we have t_{i+1} == 2^64 * t_i +// = (2^{p+64} * h) + (2^64 * l) +// == r * h + l<<64 +// Do this k more times so we end up == 2^{128*k+64}, one more than we want +// +// Writing B = 2^{64k}, the possible correction of adding r, which for +// a (k+1)-digit result is equivalent to subtracting q = 2^{64*(k+1)} - r +// would give the overall worst-case value minus q of +// [ B * (B^k - 1) + (B - 1) * r ] - [B^{k+1} - r] +// = B * (r - 1) < B^{k+1} so we keep inside k+1 digits as required. +// +// This implementation makes the shift implicit by starting b with the +// "previous" digit (initially 0) to offset things by 1. + + mov i, k +bignum_modifier_modloop: + mov j, xzr + mov b, xzr + adds c, xzr, xzr +bignum_modifier_cmaloop: + ldr a, [z, j, lsl #3] + mul l, h, a + adcs b, b, c + umulh c, h, a + adc c, c, xzr + adds l, b, l + ldr b, [t, j, lsl #3] + str l, [t, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_modifier_cmaloop + + adcs h, b, c + + csetm l, cs + + adds j, xzr, xzr +bignum_modifier_oaloop: + ldr a, [t, j, lsl #3] + ldr b, [z, j, lsl #3] + and b, b, l + adcs a, a, b + str a, [t, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_modifier_oaloop + adc h, h, xzr + + subs i, i, #1 + bne bignum_modifier_modloop + +// Compute the negated modular inverse w (same register as i, not used again). + + ldr a, [m] + lsl w, a, #2 + sub w, a, w + eor w, w, #2 + mov l, #1 + madd c, a, w, l + mul b, c, c + madd w, c, w, w + mul c, b, b + madd w, b, w, w + mul b, c, c + madd w, c, w, w + madd w, b, w, w + +// Now do one almost-Montgomery reduction w.r.t. the original m +// which lops off one 2^64 from the congruence and, with the usual +// almost-Montgomery correction, gets us back inside k digits for +// the end result. 
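As an aside, the short madd chain that computed w above is the standard Newton/Hensel iteration for a word-level negated modular inverse; a C model (illustration only, not part of the patch; the helper name is invented):

#include <stdint.h>

// For odd m0, returns w with m0 * w == -1 (mod 2^64). The initial guess is
// correct in roughly the low 5 bits and each step squares the error term,
// so four corrections are enough for 64 bits.
static uint64_t word_negmodinv_model(uint64_t m0) {
  uint64_t w = (m0 - (m0 << 2)) ^ 2;   // ~5 good bits
  uint64_t e = m0 * w + 1;             // error term, == 0 (mod 2^5)
  w = w * (e + 1); e = e * e;          // ~10 bits
  w = w * (e + 1); e = e * e;          // ~20 bits
  w = w * (e + 1); e = e * e;          // ~40 bits
  w = w * (e + 1);                     // ~80 bits, i.e. exact mod 2^64
  return w;
}

With d = t[0] * w (mod 2^64), the sum t[0] + d * m[0] is divisible by 2^64, which is what lets the reduction loop below discard the low digit.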
+ + ldr b, [t] + mul d, b, w + + mul l, d, a + umulh c, d, a + mov j, #1 + sub a, k, #1 + adds xzr, b, l + cbz a, bignum_modifier_amontend + +bignum_modifier_amontloop: + ldr a, [m, j, lsl #3] + ldr b, [t, j, lsl #3] + mul l, d, a + adcs b, b, c + umulh c, d, a + adc c, c, xzr + adds b, b, l + sub a, j, #1 + str b, [t, a, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_modifier_amontloop +bignum_modifier_amontend: + adcs h, h, c + csetm l, cs + sub a, k, #1 + str h, [t, a, lsl #3] + + subs j, xzr, xzr +bignum_modifier_osloop: + ldr a, [t, j, lsl #3] + ldr b, [m, j, lsl #3] + and b, b, l + sbcs a, a, b + str a, [z, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_modifier_osloop + +// So far, the code (basically the same as bignum_amontifier) has produced +// a k-digit value z == 2^{128k} (mod m), not necessarily fully reduced mod m. +// We now do a short Montgomery reduction (similar to bignum_demont) so that +// we achieve full reduction mod m while lopping 2^{64k} off the congruence. +// We recycle h as the somewhat strangely-named outer loop counter. + + mov h, k + +bignum_modifier_montouterloop: + ldr b, [z] + mul d, b, w + ldr a, [m] + mul l, d, a + umulh c, d, a + mov j, #1 + sub a, k, #1 + adds xzr, b, l + cbz a, bignum_modifier_montend +bignum_modifier_montloop: + ldr a, [m, j, lsl #3] + ldr b, [z, j, lsl #3] + mul l, d, a + adcs b, b, c + umulh c, d, a + adc c, c, xzr + adds b, b, l + sub a, j, #1 + str b, [z, a, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_modifier_montloop +bignum_modifier_montend: + adc c, c, xzr + sub a, k, #1 + str c, [z, a, lsl #3] + + subs h, h, #1 + bne bignum_modifier_montouterloop + +// Now do a comparison of z with m to set a final correction mask +// indicating that z >= m and so we need to subtract m. + + subs j, xzr, xzr +bignum_modifier_cmploop: + ldr a, [z, j, lsl #3] + ldr b, [m, j, lsl #3] + sbcs xzr, a, b + add j, j, #1 + sub a, j, k + cbnz a, bignum_modifier_cmploop + csetm h, cs + +// Now do a masked subtraction of m for the final reduced result. + + subs j, xzr, xzr +bignum_modifier_corrloop: + ldr a, [z, j, lsl #3] + ldr b, [m, j, lsl #3] + and b, b, h + sbcs a, a, b + str a, [z, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_modifier_corrloop + +bignum_modifier_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modinv.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modinv.S new file mode 100644 index 00000000000..b34ab65b382 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modinv.S @@ -0,0 +1,608 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Invert modulo m, z = (1/a) mod b, assuming b is an odd number > 1, coprime a +// Inputs a[k], b[k]; output z[k]; temporary buffer t[>=3*k] +// +// extern void bignum_modinv +// (uint64_t k, uint64_t *z, uint64_t *a, uint64_t *b, uint64_t *t); +// +// k-digit (digit=64 bits) "z := a^-1 mod b" (modular inverse of a modulo b) +// using t as a temporary buffer (t at least 3*k words = 24*k bytes), and +// assuming that a and b are coprime *and* that b is an odd number > 1. 
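A hypothetical caller sketch (illustration only; the digit count, values and buffer names are made up) showing how this prototype is used, in particular that the temporary buffer must provide at least 3*k words:

#include <stdint.h>

extern void bignum_modinv(uint64_t k, uint64_t *z, uint64_t *a,
                          uint64_t *b, uint64_t *t);

void modinv_example(void) {
  uint64_t a[4] = {2, 0, 0, 0};          // b is odd, so a = 2 is coprime to it
  uint64_t b[4] = {0xFFFFFFFFFFFFFFC5ULL, 0xFFFFFFFFFFFFFFFFULL,
                   0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
  uint64_t z[4];
  uint64_t t[3 * 4];                     // temporary buffer: at least 3*k words
  bignum_modinv(4, z, a, b, t);          // z := a^-1 mod b
}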
+// +// Standard ARM ABI: X0 = k, X1 = z, X2 = a, X3 = b, X4 = t +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modinv) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modinv) + .text + .balign 4 + +// We get CHUNKSIZE bits per outer iteration, 64 minus a few for proxy errors + +#define CHUNKSIZE 58 + +// Pervasive variables + +#define k x0 +#define z x1 +#define b x3 +#define w x4 + +// This one is recycled after initial copying in of a as outer loop counter + +#define a x2 +#define t x2 + +// Additional variables; later ones are currently rather high regs + +#define l x5 + +#define m x21 +#define n x22 + +// The matrix of update factors to apply to m and n +// Also used a couple of additional temporary variables for the swapping loop +// Also used as an extra down-counter in corrective negation loops + +#define m_m x6 +#define m_n x7 +#define n_m x8 +#define n_n x9 + +#define j x6 + +// General temporary variables and loop counters + +#define i x10 +#define t1 x11 +#define t2 x12 + +// High and low proxies for the inner loop +// Then re-used for high and carry words during actual cross-multiplications + +#define m_hi x13 +#define n_hi x14 +#define m_lo x15 +#define n_lo x16 + +#define h1 x13 +#define h2 x14 +#define l1 x15 +#define l2 x16 + +#define c1 x17 +#define c2 x19 + +// Negated modular inverse for Montgomery + +#define v x20 + +// Some more intuitive names for temp regs in initial word-level negmodinv. +// These just use t1 and t2 again, though carefully since t1 = initial b[0] + +#define one t2 +#define e1 t2 +#define e2 t1 +#define e4 t2 +#define e8 t1 + +S2N_BN_SYMBOL(bignum_modinv): + +// We make use of registers beyond the modifiable + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + +// If k = 0 then do nothing (this is out of scope anyway) + + cbz k, bignum_modinv_end + +// Set up the additional two buffers m and n beyond w in temp space + + lsl i, k, #3 + add m, w, i + add n, m, i + +// Initialize the main buffers with their starting values: +// m = a, n = b, w = b (to be tweaked to b - 1) and z = 0 + + mov i, xzr +bignum_modinv_copyloop: + ldr t1, [a, i, lsl #3] + ldr t2, [b, i, lsl #3] + str t1, [m, i, lsl #3] + str t2, [n, i, lsl #3] + str t2, [w, i, lsl #3] + str xzr, [z, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_modinv_copyloop + +// Tweak down w to b - 1 (this crude approach is safe as b needs to be odd +// for it to be in scope). We have then established the congruence invariant: +// +// a * w == -m (mod b) +// a * z == n (mod b) +// +// This, with the bound w <= b and z <= b, is maintained round the outer loop + + ldr t1, [w] + sub t2, t1, #1 + str t2, [w] + +// Compute v = negated modular inverse of b mod 2^64, reusing t1 from above +// This is used for Montgomery reduction operations each time round the loop + + lsl v, t1, #2 + sub v, t1, v + eor v, v, #2 + mov one, #1 + madd e1, t1, v, one + mul e2, e1, e1 + madd v, e1, v, v + mul e4, e2, e2 + madd v, e2, v, v + mul e8, e4, e4 + madd v, e4, v, v + madd v, e8, v, v + +// Set up the outer loop count of 128 * k +// The invariant is that m * n < 2^t at all times. + + lsl t, k, #7 + +// Start of the main outer loop iterated t / CHUNKSIZE times + +bignum_modinv_outerloop: + +// We need only bother with sharper l = min k (ceil(t/64)) digits +// for the computations on m and n (but we still need k for w and z). 
+// Either both m and n fit in l digits, or m has become zero and so +// nothing happens in the loop anyway and this makes no difference. + + add i, t, #63 + lsr l, i, #6 + cmp l, k + csel l, k, l, cs + +// Select upper and lower proxies for both m and n to drive the inner +// loop. The lower proxies are simply the lowest digits themselves, +// m_lo = m[0] and n_lo = n[0], while the upper proxies are bitfields +// of the two inputs selected so their top bit (63) aligns with the +// most significant bit of *either* of the two inputs. + + mov h1, xzr // Previous high and low for m + mov l1, xzr + mov h2, xzr // Previous high and low for n + mov l2, xzr + mov c2, xzr // Mask flag: previous word of one was nonzero + // and in this case h1 and h2 are those words + mov i, xzr +bignum_modinv_toploop: + ldr t1, [m, i, lsl #3] + ldr t2, [n, i, lsl #3] + orr c1, t1, t2 + cmp c1, xzr + and c1, c2, h1 + csel l1, c1, l1, ne + and c1, c2, h2 + csel l2, c1, l2, ne + csel h1, t1, h1, ne + csel h2, t2, h2, ne + csetm c2, ne + add i, i, #1 + cmp i, l + bcc bignum_modinv_toploop + + orr t1, h1, h2 + clz t2, t1 + negs c1, t2 + lsl h1, h1, t2 + csel l1, l1, xzr, ne + lsl h2, h2, t2 + csel l2, l2, xzr, ne + lsr l1, l1, c1 + lsr l2, l2, c1 + orr m_hi, h1, l1 + orr n_hi, h2, l2 + + ldr m_lo, [m] + ldr n_lo, [n] + +// Now the inner loop, with i as loop counter from CHUNKSIZE down. +// This records a matrix of updates to apply to the initial +// values of m and n with, at stage j: +// +// sgn * m' = (m_m * m - m_n * n) / 2^j +// -sgn * n' = (n_m * m - n_n * n) / 2^j +// +// where "sgn" is either +1 or -1, and we lose track of which except +// that both instance above are the same. This throwing away the sign +// costs nothing (since we have to correct in general anyway because +// of the proxied comparison) and makes things a bit simpler. But it +// is simply the parity of the number of times the first condition, +// used as the swapping criterion, fires in this loop. + + mov m_m, #1 + mov m_n, xzr + mov n_m, xzr + mov n_n, #1 + + mov i, #CHUNKSIZE + +// Conceptually in the inner loop we follow these steps: +// +// * If m_lo is odd and m_hi < n_hi, then swap the four pairs +// (m_hi,n_hi); (m_lo,n_lo); (m_m,n_m); (m_n,n_n) +// +// * Now, if m_lo is odd (old or new, doesn't matter as initial n_lo is odd) +// m_hi := m_hi - n_hi, m_lo := m_lo - n_lo +// m_m := m_m + n_m, m_n := m_n + n_n +// +// * Halve and double them +// m_hi := m_hi / 2, m_lo := m_lo / 2 +// n_m := n_m * 2, n_n := n_n * 2 +// +// The actual computation computes updates before actually swapping and +// then corrects as needed. It also maintains the invariant ~ZF <=> odd(m_lo), +// since it seems to reduce the dependent latency. Set that up first. + + ands xzr, m_lo, #1 + +bignum_modinv_innerloop: + +// At the start of the loop ~ZF <=> m_lo is odd; mask values accordingly +// Set the flags for m_hi - [~ZF] * n_hi so we know to flip things. + + csel t1, n_hi, xzr, ne + csel t2, n_lo, xzr, ne + csel c1, n_m, xzr, ne + csel c2, n_n, xzr, ne + ccmp m_hi, n_hi, #0x2, ne + +// Compute subtractive updates, trivial in the case ZF <=> even(m_lo). + + sub t1, m_hi, t1 + sub t2, m_lo, t2 + +// If the subtraction borrows, swap things appropriately, negating where +// we've already subtracted so things are as if we actually swapped first. 
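For orientation, a deliberately branchy (and therefore not constant-time) C model of the conceptual steps listed above (illustration only, not part of the patch; the struct and helper names are invented). It runs on the 64-bit proxies and returns the 2x2 update matrix that the surrounding code then applies to the full k-digit numbers:

#include <stdint.h>

typedef struct { uint64_t m_m, m_n, n_m, n_n; } update_t;

// One chunk of the inner loop; up to a common sign the result satisfies
//   m' = (m_m * m - m_n * n) / 2^58,  n' = (n_m * m - n_n * n) / 2^58.
static update_t divstep_chunk_model(uint64_t m_hi, uint64_t m_lo,
                                    uint64_t n_hi, uint64_t n_lo) {
  update_t u = {1, 0, 0, 1};
  for (int i = 0; i < 58; i++) {          // CHUNKSIZE iterations
    if (m_lo & 1) {
      if (m_hi < n_hi) {                  // swap the roles of m and n
        uint64_t t;
        t = m_hi; m_hi = n_hi; n_hi = t;
        t = m_lo; m_lo = n_lo; n_lo = t;
        t = u.m_m; u.m_m = u.n_m; u.n_m = t;
        t = u.m_n; u.m_n = u.n_n; u.n_n = t;
      }
      m_hi -= n_hi;                       // make m even
      m_lo -= n_lo;
      u.m_m += u.n_m;
      u.m_n += u.n_n;
    }
    m_hi >>= 1;                           // halve m ...
    m_lo >>= 1;
    u.n_m += u.n_m;                       // ... and double n's row instead
    u.n_n += u.n_n;
  }
  return u;
}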
+ + csel n_hi, n_hi, m_hi, cs + cneg t1, t1, cc + csel n_lo, n_lo, m_lo, cs + cneg m_lo, t2, cc + csel n_m, n_m, m_m, cs + csel n_n, n_n, m_n, cs + +// Update and shift while setting oddness flag for next iteration +// We look at bit 1 of t2 (m_lo before possible negation), which is +// safe because it is even. + + ands xzr, t2, #2 + add m_m, m_m, c1 + add m_n, m_n, c2 + lsr m_hi, t1, #1 + lsr m_lo, m_lo, #1 + add n_m, n_m, n_m + add n_n, n_n, n_n + +// Next iteration; don't disturb the flags since they are used at entry + + sub i, i, #1 + cbnz i, bignum_modinv_innerloop + +// Apply the update to w and z, using addition in this case, and also take +// the chance to shift an additional 6 = 64-CHUNKSIZE bits to be ready for a +// Montgomery multiplication. Because we know that m_m + m_n <= 2^58 and +// w, z <= b < 2^{64k}, we know that both of these fit in k+1 words. +// We do this before the m-n update to allow us to play with c1 and c2 here. +// +// h1::w = 2^6 * (m_m * w + m_n * z) +// h2::z = 2^6 * (n_m * w + n_n * z) +// +// with c1 and c2 recording previous words for the shifting part + + mov h1, xzr + mov h2, xzr + mov c1, xzr + mov c2, xzr + + mov i, xzr +bignum_modinv_congloop: + ldr t1, [w, i, lsl #3] + ldr t2, [z, i, lsl #3] + + mul l1, m_m, t1 + mul l2, m_n, t2 + adds l1, l1, h1 + umulh h1, m_m, t1 + adc h1, h1, xzr + adds l1, l1, l2 + extr c1, l1, c1, #CHUNKSIZE + str c1, [w, i, lsl #3] + mov c1, l1 + umulh l1, m_n, t2 + adc h1, h1, l1 + + mul l1, n_m, t1 + mul l2, n_n, t2 + adds l1, l1, h2 + umulh h2, n_m, t1 + adc h2, h2, xzr + adds l1, l1, l2 + extr c2, l1, c2, #CHUNKSIZE + str c2, [z, i, lsl #3] + mov c2, l1 + umulh l1, n_n, t2 + adc h2, h2, l1 + + add i, i, #1 + cmp i, k + bcc bignum_modinv_congloop + + extr h1, h1, c1, #CHUNKSIZE + extr h2, h2, c2, #CHUNKSIZE + +// Do a Montgomery reduction of h1::w + + ldr t1, [w] + mul c1, t1, v + ldr t2, [b] + mul l1, c1, t2 + umulh l2, c1, t2 + adds t1, t1, l1 // Will be zero but want the carry + + mov i, #1 + sub t1, k, #1 + cbz t1, bignum_modinv_wmontend +bignum_modinv_wmontloop: + ldr t1, [b, i, lsl #3] + ldr t2, [w, i, lsl #3] + mul l1, c1, t1 + adcs t2, t2, l2 + umulh l2, c1, t1 + adc l2, l2, xzr + adds t2, t2, l1 + sub l1, i, #1 + str t2, [w, l1, lsl #3] + add i, i, #1 + sub t1, i, k + cbnz t1, bignum_modinv_wmontloop +bignum_modinv_wmontend: + adcs l2, l2, h1 + adc h1, xzr, xzr + sub l1, i, #1 + str l2, [w, l1, lsl #3] + + subs i, xzr, xzr +bignum_modinv_wcmploop: + ldr t1, [w, i, lsl #3] + ldr t2, [b, i, lsl #3] + sbcs xzr, t1, t2 + add i, i, #1 + sub t1, i, k + cbnz t1, bignum_modinv_wcmploop + + sbcs xzr, h1, xzr + csetm h1, cs + + subs i, xzr, xzr +bignum_modinv_wcorrloop: + ldr t1, [w, i, lsl #3] + ldr t2, [b, i, lsl #3] + and t2, t2, h1 + sbcs t1, t1, t2 + str t1, [w, i, lsl #3] + add i, i, #1 + sub t1, i, k + cbnz t1, bignum_modinv_wcorrloop + +// Do a Montgomery reduction of h2::z + + ldr t1, [z] + mul c1, t1, v + ldr t2, [b] + mul l1, c1, t2 + umulh l2, c1, t2 + adds t1, t1, l1 // Will be zero but want the carry + + mov i, #1 + sub t1, k, #1 + cbz t1, bignum_modinv_zmontend +bignum_modinv_zmontloop: + ldr t1, [b, i, lsl #3] + ldr t2, [z, i, lsl #3] + mul l1, c1, t1 + adcs t2, t2, l2 + umulh l2, c1, t1 + adc l2, l2, xzr + adds t2, t2, l1 + sub l1, i, #1 + str t2, [z, l1, lsl #3] + add i, i, #1 + sub t1, i, k + cbnz t1, bignum_modinv_zmontloop +bignum_modinv_zmontend: + adcs l2, l2, h2 + adc h2, xzr, xzr + sub l1, i, #1 + str l2, [z, l1, lsl #3] + + subs i, xzr, xzr +bignum_modinv_zcmploop: + ldr t1, [z, i, lsl #3] + ldr 
t2, [b, i, lsl #3] + sbcs xzr, t1, t2 + add i, i, #1 + sub t1, i, k + cbnz t1, bignum_modinv_zcmploop + + sbcs xzr, h2, xzr + csetm h2, cs + + subs i, xzr, xzr +bignum_modinv_zcorrloop: + ldr t1, [z, i, lsl #3] + ldr t2, [b, i, lsl #3] + and t2, t2, h2 + sbcs t1, t1, t2 + str t1, [z, i, lsl #3] + add i, i, #1 + sub t1, i, k + cbnz t1, bignum_modinv_zcorrloop + +// Now actually compute the updates to m and n corresponding to the matrix, +// and correct the signs if they have gone negative. First we compute the +// (k+1)-sized updates with the following invariant (here c1 and c2 are in +// fact carry bitmasks, either 0 or -1): +// +// c1::h1::m = m_m * m - m_n * n +// c2::h2::n = n_m * m - n_n * n + + mov h1, xzr + mov h2, xzr + mov c1, xzr + mov c2, xzr + mov i, xzr +bignum_modinv_crossloop: + ldr t1, [m, i, lsl #3] + ldr t2, [n, i, lsl #3] + + mul l1, m_m, t1 + mul l2, m_n, t2 + adds l1, l1, h1 + umulh h1, m_m, t1 + adc h1, h1, xzr + subs l1, l1, l2 + str l1, [m, i, lsl #3] + umulh l1, m_n, t2 + sub c1, l1, c1 + sbcs h1, h1, c1 + csetm c1, cc + + mul l1, n_m, t1 + mul l2, n_n, t2 + adds l1, l1, h2 + umulh h2, n_m, t1 + adc h2, h2, xzr + subs l1, l1, l2 + str l1, [n, i, lsl #3] + umulh l1, n_n, t2 + sub c2, l1, c2 + sbcs h2, h2, c2 + csetm c2, cc + + add i, i, #1 + cmp i, l + bcc bignum_modinv_crossloop + +// Write back m optionally negated and shifted right CHUNKSIZE bits + + adds xzr, c1, c1 + + ldr l1, [m] + mov i, xzr + sub j, l, #1 + cbz j, bignum_modinv_negskip1 + +bignum_modinv_negloop1: + add t1, i, #8 + ldr t2, [m, t1] + extr l1, t2, l1, #CHUNKSIZE + eor l1, l1, c1 + adcs l1, l1, xzr + str l1, [m, i] + mov l1, t2 + add i, i, #8 + sub j, j, #1 + cbnz j, bignum_modinv_negloop1 +bignum_modinv_negskip1: + extr l1, h1, l1, #CHUNKSIZE + eor l1, l1, c1 + adcs l1, l1, xzr + str l1, [m, i] + +// Write back n optionally negated and shifted right CHUNKSIZE bits + + adds xzr, c2, c2 + + ldr l1, [n] + mov i, xzr + sub j, l, #1 + cbz j, bignum_modinv_negskip2 +bignum_modinv_negloop2: + add t1, i, #8 + ldr t2, [n, t1] + extr l1, t2, l1, #CHUNKSIZE + eor l1, l1, c2 + adcs l1, l1, xzr + str l1, [n, i] + mov l1, t2 + add i, i, #8 + sub j, j, #1 + cbnz j, bignum_modinv_negloop2 +bignum_modinv_negskip2: + extr l1, h2, l1, #CHUNKSIZE + eor l1, l1, c2 + adcs l1, l1, xzr + str l1, [n, i] + +// Finally, use the signs c1 and c2 to do optional modular negations of +// w and z respectively, flipping c2 to make signs work. We don't make +// any checks for zero values, but we certainly retain w <= b and z <= b. +// This is enough for the Montgomery step in the next iteration to give +// strict reduction w < b amd z < b, and anyway when we terminate we +// could not have z = b since it violates the coprimality assumption for +// in-scope cases. + + mov i, xzr + adds xzr, c1, c1 +bignum_modinv_wfliploop: + ldr t1, [b, i, lsl #3] + ldr t2, [w, i, lsl #3] + and t1, t1, c1 + eor t2, t2, c1 + adcs t1, t1, t2 + str t1, [w, i, lsl #3] + add i, i, #1 + sub t1, i, k + cbnz t1, bignum_modinv_wfliploop + + mvn c2, c2 + + mov i, xzr + adds xzr, c2, c2 +bignum_modinv_zfliploop: + ldr t1, [b, i, lsl #3] + ldr t2, [z, i, lsl #3] + and t1, t1, c2 + eor t2, t2, c2 + adcs t1, t1, t2 + str t1, [z, i, lsl #3] + add i, i, #1 + sub t1, i, k + cbnz t1, bignum_modinv_zfliploop + +// End of main loop. We can stop if t' <= 0 since then m * n < 2^0, which +// since n is odd and m and n are coprime (in the in-scope cases) means +// m = 0, n = 1 and hence from the congruence invariant a * z == 1 (mod b). 
+// Moreover we do in fact need to maintain strictly t > 0 in the main loop, +// or the computation of the optimized digit bound l could collapse to 0. + + subs t, t, #CHUNKSIZE + bhi bignum_modinv_outerloop + +bignum_modinv_end: + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modoptneg.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modoptneg.S new file mode 100644 index 00000000000..2f383f1a29e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modoptneg.S @@ -0,0 +1,78 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally negate modulo m, z := (-x) mod m (if p nonzero) or z := x +// (if p zero), assuming x reduced +// Inputs p, x[k], m[k]; output z[k] +// +// extern void bignum_modoptneg +// (uint64_t k, uint64_t *z, uint64_t p, uint64_t *x, uint64_t *m); +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = p, X3 = x, X4 = m +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modoptneg) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modoptneg) + .text + .balign 4 + +#define k x0 +#define z x1 +#define p x2 +#define x x3 +#define m x4 +#define i x5 +#define a x6 +#define b x7 + + +S2N_BN_SYMBOL(bignum_modoptneg): + +// Do nothing if k = 0 + + cbz k, bignum_modoptneg_end + +// Make an additional check for zero input, and force p to zero in this case. +// This can be skipped if the input is known not to be zero a priori. + + mov i, xzr + mov a, xzr +bignum_modoptneg_cmploop: + ldr b, [x, i, lsl #3] + orr a, a, b + add i, i, #1 + cmp i, k + bcc bignum_modoptneg_cmploop + + cmp a, xzr + csel p, p, xzr, ne + +// Turn the input p into a strict bitmask + + cmp p, xzr + csetm p, ne + +// Main loop + + mov i, xzr + adds xzr, p, p +bignum_modoptneg_mainloop: + + ldr a, [m, i, lsl #3] + ldr b, [x, i, lsl #3] + and a, a, p + eor b, b, p + adcs a, a, b + str a, [z, i, lsl #3] + add i, i, #1 + sub a, i, k + cbnz a, bignum_modoptneg_mainloop + +bignum_modoptneg_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modsub.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modsub.S new file mode 100644 index 00000000000..0af361be4fb --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modsub.S @@ -0,0 +1,69 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Subtract modulo m, z := (x - y) mod m, assuming x and y reduced +// Inputs x[k], y[k], m[k]; output z[k] +// +// extern void bignum_modsub +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = x, X3 = y, X4 = m +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modsub) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modsub) + .text + .balign 4 + +#define k x0 +#define z x1 +#define x x2 +#define y x3 +#define m x4 +#define i x5 +#define j x6 +#define a x7 +#define b x8 +#define c x9 + + +S2N_BN_SYMBOL(bignum_modsub): + +adds j, k, xzr // j = k and ZF = (k = 0) + beq bignum_modsub_end // if k = 0 do nothing + subs i, xzr, xzr // i = 0 and CF = 1 + +// Subtract z := x - y and record a mask for the carry x - y < 0 + +bignum_modsub_subloop: + ldr a, [x, i] + ldr b, [y, i] + sbcs a, a, b + str a, [z, i] + add i, i, #8 + sub j, j, #1 + cbnz j, bignum_modsub_subloop + csetm c, cc + +// Now do a masked addition z := z + [c] * m + + mov j, k + adds i, xzr, xzr +bignum_modsub_addloop: + ldr a, [z, i] + ldr b, [m, i] + and b, b, c + adcs a, a, b + str a, [z, i] + add i, i, #8 + sub j, j, #1 + cbnz j, bignum_modsub_addloop + +bignum_modsub_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_montifier.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_montifier.S new file mode 100644 index 00000000000..b82a65013c3 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_montifier.S @@ -0,0 +1,457 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Compute "montification" constant z := 2^{128k} mod m +// Input m[k]; output z[k]; temporary buffer t[>=k] +// +// extern void bignum_montifier +// (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t); +// +// The last argument points to a temporary buffer t that should have size >= k. +// This is called "montifier" because given any other k-digit number x, +// whether or not it's reduced modulo m, it can be mapped to its Montgomery +// representation (2^{64k} * x) mod m just by Montgomery multiplication by z. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = m, X3 = t +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montifier) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montifier) + .text + .balign 4 + +#define k x0 +#define z x1 +#define m x2 +#define t x3 + +// Some variables +// Modular inverse w is aliased to i, but we never use them together + +#define i x4 +#define w x4 +#define j x5 +#define h x6 +#define a x7 +#define l x8 +#define c x9 +#define b x10 +#define d x11 + +// Some aliases for the values b and d + +#define r x10 +#define q x11 + + +S2N_BN_SYMBOL(bignum_montifier): + +// If k = 0 the whole operation is trivial + + cbz k, bignum_montifier_end + +// Copy the input m into the temporary buffer t. The temporary register +// c matters since we want it to hold the highest digit, ready for the +// normalization phase. 
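The copy loop below is followed by the same constant-time digit normalization used in bignum_modifier; as a rough C model of what that normalization does (illustration only, not part of the patch, written with a visible conditional instead of the csel/mask idiom):

#include <stdint.h>

// Repeat (k-1) times: if the current top digit is zero, shift the whole
// number up by one digit. The real code performs identical loads and
// stores whether or not the shift is taken, so it leaks nothing.
static void normalize_digits_model(uint64_t k, uint64_t *t) {
  for (uint64_t i = 0; i + 1 < k; i++) {
    int top_is_zero = (t[k - 1] == 0);
    uint64_t prev = 0;
    for (uint64_t j = 0; j < k; j++) {
      uint64_t cur = t[j];
      t[j] = top_is_zero ? prev : cur;   // conditional one-digit left shift
      prev = cur;
    }
  }
}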
+ + mov i, xzr +bignum_montifier_copyinloop: + ldr c, [m, i, lsl #3] + str c, [t, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_montifier_copyinloop + +// Do a rather stupid but constant-time digit normalization, conditionally +// shifting left (k-1) times based on whether the top word is zero. +// With careful binary striding this could be O(k*log(k)) instead of O(k^2) +// while still retaining the constant-time style. +// The "cmp c, xzr" sets the zeroness predicate (ZF) for the entire inner loop + + subs i, k, #1 + beq bignum_montifier_normalized +bignum_montifier_normloop: + mov j, xzr + cmp c, xzr + mov a, xzr +bignum_montifier_shufloop: + mov c, a + ldr a, [t, j, lsl #3] + csel c, c, a, eq + str c, [t, j, lsl #3] + add j, j, #1 + sub d, j, k + cbnz d, bignum_montifier_shufloop + subs i, i, #1 + bne bignum_montifier_normloop + +// We now have the top digit nonzero, assuming the input was nonzero, +// and as per the invariant of the loop above, c holds that digit. So +// now just count c's leading zeros and shift t bitwise that many bits. + +bignum_montifier_normalized: + clz c, c + + mov b, xzr + mov i, xzr + ands xzr, c, #63 + csetm l, ne + neg d, c +bignum_montifier_bitloop: + ldr j, [t, i, lsl #3] + lsl a, j, c + orr a, a, b + lsr b, j, d + and b, b, l + str a, [t, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_montifier_bitloop + +// Let h be the high word of n, which in all the in-scope cases is >= 2^63. +// Now successively form q = 2^i div h and r = 2^i mod h as i goes from +// 64 to 126. We avoid just using division out of constant-time concerns +// (at the least we would need to fix up h = 0 for out-of-scope inputs) and +// don't bother with Newton-Raphson, since this stupid simple loop doesn't +// contribute much of the overall runtime at typical sizes. + + sub h, k, #1 + ldr h, [t, h, lsl #3] + mov q, #1 + neg r, h + mov i, #62 +bignum_montifier_estloop: + add q, q, q + mov a, h + sub a, a, r + cmp r, a // CF <=> r >= h - r <=> 2 * r >= h + csetm a, cs + sub q, q, a + add r, r, r + and a, a, h + sub r, r, a + subs i, i, #1 + bne bignum_montifier_estloop + +// Strictly speaking the above loop doesn't quite give the true remainder +// and quotient in the special case r = h = 2^63, so fix it up. We get +// q = 2^63 - 1 and r = 2^63 and really want q = 2^63 and r = 0. This is +// supererogatory, because the main property of q used below still holds +// in this case unless the initial m = 1, and then anyway the overall +// specification (congruence modulo m) holds degenerately. But it seems +// nicer to get a "true" quotient and remainder. + + cmp r, h + csinc q, q, q, ne + +// So now we have q and r with 2^126 = q * h + r (imagining r = 0 in the +// fixed-up case above: note that we never actually use the computed +// value of r below and so didn't adjust it). And we can assume the ranges +// q <= 2^63 and r < h < 2^64. +// +// The idea is to use q as a first quotient estimate for a remainder +// of 2^{p+62} mod n, where p = 64 * k. We have, splitting n into the +// high and low parts h and l: +// +// 2^{p+62} - q * n = 2^{p+62} - q * (2^{p-64} * h + l) +// = 2^{p+62} - (2^{p-64} * (q * h) + q * l) +// = 2^{p+62} - 2^{p-64} * (2^126 - r) - q * l +// = 2^{p-64} * r - q * l +// +// Note that 2^{p-64} * r < 2^{p-64} * h <= n +// and also q * l < 2^63 * 2^{p-64} = 2^{p-1} <= n +// so |diff| = |2^{p-64} * r - q * l| < n. +// +// If in fact diff >= 0 then it is already 2^{p+62} mod n. +// otherwise diff + n is the right answer. +// +// To (maybe?) 
make the computation slightly easier we actually flip +// the sign and compute d = q * n - 2^{p+62}. Then the answer is either +// -d (when negative) or n - d; in either case we effectively negate d. +// This negating tweak in fact spoils the result for cases where +// 2^{p+62} mod n = 0, when we get n instead. However the only case +// where this can happen is m = 1, when the whole spec holds trivially, +// and actually the remainder of the logic below works anyway since +// the latter part of the code only needs a congruence for the k-digit +// result, not strict modular reduction (the doublings will maintain +// the non-strict inequality). + + mov c, xzr + adds i, xzr, xzr +bignum_montifier_mulloop: + ldr a, [t, i, lsl #3] + mul l, q, a + adcs l, l, c + umulh c, q, a + str l, [z, i, lsl #3] + add i, i, #1 + sub a, i, k + cbnz a, bignum_montifier_mulloop + + adc c, c, xzr + mov a, #0x4000000000000000 + subs c, c, a + csetm q, cs + +// Now do [c] * n - d for our final answer + + subs i, xzr, xzr +bignum_montifier_remloop: + ldr a, [t, i, lsl #3] + ldr b, [z, i, lsl #3] + and a, a, q + sbcs a, a, b + str a, [z, i, lsl #3] + add i, i, #1 + sub a, i, k + cbnz a, bignum_montifier_remloop + +// Now still need to do a couple of modular doublings to get us all the +// way up to 2^{p+64} == r from the initial 2^{p+62} == r (mod n). + + mov c, xzr + subs j, xzr, xzr +bignum_montifier_dubloop1: + ldr a, [z, j, lsl #3] + extr c, a, c, #63 + ldr b, [t, j, lsl #3] + sbcs c, c, b + str c, [z, j, lsl #3] + mov c, a + add j, j, #1 + sub a, j, k + cbnz a, bignum_montifier_dubloop1 + lsr c, c, #63 + sbc c, c, xzr + adds j, xzr, xzr +bignum_montifier_corrloop1: + ldr a, [z, j, lsl #3] + ldr b, [t, j, lsl #3] + and b, b, c + adcs a, a, b + str a, [z, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_montifier_corrloop1 + +// This is not exactly the same: we also copy output to t giving the +// initialization t_1 = r == 2^{p+64} mod n for the main loop next. + + mov c, xzr + subs j, xzr, xzr +bignum_montifier_dubloop2: + ldr a, [z, j, lsl #3] + extr c, a, c, #63 + ldr b, [t, j, lsl #3] + sbcs c, c, b + str c, [z, j, lsl #3] + mov c, a + add j, j, #1 + sub a, j, k + cbnz a, bignum_montifier_dubloop2 + lsr c, c, #63 + sbc c, c, xzr + adds j, xzr, xzr +bignum_montifier_corrloop2: + ldr a, [z, j, lsl #3] + ldr b, [t, j, lsl #3] + and b, b, c + adcs a, a, b + str a, [z, j, lsl #3] + str a, [t, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_montifier_corrloop2 + +// We then successively generate (k+1)-digit values satisfying +// t_i == 2^{p+64*i} mod n, each of which is stored in h::t. Finish +// initialization by zeroing h initially + + mov h, xzr + +// Then if t_i = 2^{p} * h + l +// we have t_{i+1} == 2^64 * t_i +// = (2^{p+64} * h) + (2^64 * l) +// == r * h + l<<64 +// Do this 2*k more times so we end up == 2^{192*k+64}, one more than we want +// +// Writing B = 2^{64k}, the possible correction of adding r, which for +// a (k+1)-digit result is equivalent to subtracting q = 2^{64*(k+1)} - r +// would give the overall worst-case value minus q of +// [ B * (B^k - 1) + (B - 1) * r ] - [B^{k+1} - r] +// = B * (r - 1) < B^{k+1} so we keep inside k+1 digits as required. +// +// This implementation makes the shift implicit by starting b with the +// "previous" digit (initially 0) to offset things by 1. 
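A C rendering of one iteration of the loop described above, with the implicit one-digit offset made explicit (illustration only, not part of the patch; the helper name is invented). With r held in z, it replaces h::t by r*h + 2^64*t and then adds r once more if that spilled past k+1 digits:

#include <stdint.h>

// One step of t_{i+1} == 2^64 * t_i (mod n): h is the current top digit,
// t[] holds the low k digits and z[] holds r == 2^{p+64} mod n.
static void montifier_modstep_model(uint64_t k, uint64_t *h, uint64_t *t,
                                    const uint64_t *z) {
  uint64_t hh = *h, carry = 0, prev = 0;   // prev = digit shifted in from below
  for (uint64_t j = 0; j < k; j++) {
    unsigned __int128 p = (unsigned __int128)hh * z[j] + prev + carry;
    prev = t[j];                           // old digit feeds the next position
    t[j] = (uint64_t)p;
    carry = (uint64_t)(p >> 64);
  }
  unsigned __int128 top = (unsigned __int128)prev + carry;
  uint64_t mask = (uint64_t)0 - (uint64_t)(top >> 64);  // spilled past k+1 digits?
  uint64_t c = 0;
  for (uint64_t j = 0; j < k; j++) {       // conditionally add r back in
    unsigned __int128 s = (unsigned __int128)t[j] + (z[j] & mask) + c;
    t[j] = (uint64_t)s;
    c = (uint64_t)(s >> 64);
  }
  *h = (uint64_t)top + c;
}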
+ + add i, k, k +bignum_montifier_modloop: + mov j, xzr + mov b, xzr + adds c, xzr, xzr +bignum_montifier_cmaloop: + ldr a, [z, j, lsl #3] + mul l, h, a + adcs b, b, c + umulh c, h, a + adc c, c, xzr + adds l, b, l + ldr b, [t, j, lsl #3] + str l, [t, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_montifier_cmaloop + + adcs h, b, c + + csetm l, cs + + adds j, xzr, xzr +bignum_montifier_oaloop: + ldr a, [t, j, lsl #3] + ldr b, [z, j, lsl #3] + and b, b, l + adcs a, a, b + str a, [t, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_montifier_oaloop + adc h, h, xzr + + subs i, i, #1 + bne bignum_montifier_modloop + +// Compute the negated modular inverse w (same register as i, not used again). + + ldr a, [m] + lsl w, a, #2 + sub w, a, w + eor w, w, #2 + mov l, #1 + madd c, a, w, l + mul b, c, c + madd w, c, w, w + mul c, b, b + madd w, b, w, w + mul b, c, c + madd w, c, w, w + madd w, b, w, w + +// Now do one almost-Montgomery reduction w.r.t. the original m +// which lops off one 2^64 from the congruence and, with the usual +// almost-Montgomery correction, gets us back inside k digits for +// the end result. + + ldr b, [t] + mul d, b, w + + mul l, d, a + umulh c, d, a + mov j, #1 + sub a, k, #1 + adds xzr, b, l + cbz a, bignum_montifier_amontend + +bignum_montifier_amontloop: + ldr a, [m, j, lsl #3] + ldr b, [t, j, lsl #3] + mul l, d, a + adcs b, b, c + umulh c, d, a + adc c, c, xzr + adds b, b, l + sub a, j, #1 + str b, [t, a, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_montifier_amontloop +bignum_montifier_amontend: + adcs h, h, c + csetm l, cs + sub a, k, #1 + str h, [t, a, lsl #3] + + subs j, xzr, xzr +bignum_montifier_osloop: + ldr a, [t, j, lsl #3] + ldr b, [m, j, lsl #3] + and b, b, l + sbcs a, a, b + str a, [z, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_montifier_osloop + +// So far, the code(basically a variant of bignum_amontifier) has produced +// a k-digit value z == 2^{192k} (mod m), not necessarily fully reduced mod m. +// We now do a short Montgomery reduction (similar to bignum_demont) so that +// we achieve full reduction mod m while lopping 2^{64k} off the congruence. +// We recycle h as the somewhat strangely-named outer loop counter. + + mov h, k + +bignum_montifier_montouterloop: + ldr b, [z] + mul d, b, w + ldr a, [m] + mul l, d, a + umulh c, d, a + mov j, #1 + sub a, k, #1 + adds xzr, b, l + cbz a, bignum_montifier_montend +bignum_montifier_montloop: + ldr a, [m, j, lsl #3] + ldr b, [z, j, lsl #3] + mul l, d, a + adcs b, b, c + umulh c, d, a + adc c, c, xzr + adds b, b, l + sub a, j, #1 + str b, [z, a, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_montifier_montloop +bignum_montifier_montend: + adc c, c, xzr + sub a, k, #1 + str c, [z, a, lsl #3] + + subs h, h, #1 + bne bignum_montifier_montouterloop + +// Now do a comparison of z with m to set a final correction mask +// indicating that z >= m and so we need to subtract m. + + subs j, xzr, xzr +bignum_montifier_cmploop: + ldr a, [z, j, lsl #3] + ldr b, [m, j, lsl #3] + sbcs xzr, a, b + add j, j, #1 + sub a, j, k + cbnz a, bignum_montifier_cmploop + csetm h, cs + +// Now do a masked subtraction of m for the final reduced result. 
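This compare-then-masked-subtract tail is the same constant-time final correction used throughout these files; a compact C equivalent (illustration only, not part of the patch; the helper name is invented):

#include <stdint.h>

// If z >= m then z := z - m, else leave z alone, without branching on the data.
static void masked_correct_model(uint64_t k, uint64_t *z, const uint64_t *m) {
  uint64_t borrow = 0;
  for (uint64_t j = 0; j < k; j++) {       // trial subtraction, keep only the borrow
    unsigned __int128 d = (unsigned __int128)z[j] - m[j] - borrow;
    borrow = (uint64_t)(d >> 64) & 1;
  }
  uint64_t mask = borrow - 1;              // all-ones exactly when z >= m
  borrow = 0;
  for (uint64_t j = 0; j < k; j++) {       // masked subtraction
    unsigned __int128 d = (unsigned __int128)z[j] - (m[j] & mask) - borrow;
    z[j] = (uint64_t)d;
    borrow = (uint64_t)(d >> 64) & 1;
  }
}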
+ + subs j, xzr, xzr +bignum_montifier_corrloop: + ldr a, [z, j, lsl #3] + ldr b, [m, j, lsl #3] + and b, b, h + sbcs a, a, b + str a, [z, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_montifier_corrloop + +bignum_montifier_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_montmul.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_montmul.S new file mode 100644 index 00000000000..672885189b2 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_montmul.S @@ -0,0 +1,193 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^{64k}) mod m +// Inputs x[k], y[k], m[k]; output z[k] +// +// extern void bignum_montmul +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); +// +// Does z := (x * y / 2^{64k}) mod m, assuming x * y <= 2^{64k} * m, which is +// guaranteed in particular if x < m, y < m initially (the "intended" case). +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = x, X3 = y, X4 = m +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul) + .text + .balign 4 + +#define k x0 +#define z x1 +#define x x2 +#define y x3 +#define m x4 + +// Negated modular inverse +#define w x5 +// Top carry for k'th position +#define c0 x6 +// Additional top carry for (k+1)'th position +#define c1 x7 +// Outer loop counter +#define i x8 +// Home for i'th digit or Montgomery multiplier +#define d x9 +// Inner loop counter +#define j x10 + +#define h x11 +#define e x12 +#define l x13 +#define a x14 + +// This is just a short-term temporary used in zero-test subtraction. +// It's aliased to the same register as "a" which is always safe here. + +#define t x14 + +// Some more intuitive names for temp regs in initial word-level negmodinv. +// These just use c0 and c1 again, which aren't initialized early on. + +#define one x6 +#define e1 x6 +#define e2 x7 +#define e4 x6 +#define e8 x7 + + +S2N_BN_SYMBOL(bignum_montmul): + +// If k = 0 the whole operation is trivial + + cbz k, bignum_montmul_end + +// Compute word-level negated modular inverse w for m == m[0]. +// This is essentially the same as word_negmodinv. + + ldr a, [m] + lsl w, a, #2 + sub w, a, w + eor w, w, #2 + mov one, #1 + madd e1, a, w, one + mul e2, e1, e1 + madd w, e1, w, w + mul e4, e2, e2 + madd w, e2, w, w + mul e8, e4, e4 + madd w, e4, w, w + madd w, e8, w, w + +// Initialize the output c0::z to zero so we can then consistently add rows. +// It would be a bit more efficient to special-case the zeroth row, but +// this keeps the code slightly simpler. + + mov i, xzr +bignum_montmul_zoop: + str xzr, [z, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_montmul_zoop + mov c0, xzr + +// Outer loop pulling down digits d=x[i], multiplying by y and reducing + + mov i, xzr +bignum_montmul_outerloop: + +// Multiply-add loop where we always have CF + previous high part h to add in +// Note that in general we do need yet one more carry in this phase and hence +// initialize c1 with the top carry. 
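A C sketch of this row operation with the two top carries made explicit (illustration only, not part of the patch; the helper name is invented). The 128-bit accumulator stands in for the CF-plus-high-part carry chain of the assembly:

#include <stdint.h>

// (c1::c0::z) += d * y over k digits, where c0 is the running carry at the
// k'th position and c1 absorbs the one extra carry mentioned above.
static void row_madd_model(uint64_t k, uint64_t *z, uint64_t *c0, uint64_t *c1,
                           uint64_t d, const uint64_t *y) {
  uint64_t h = 0;
  for (uint64_t j = 0; j < k; j++) {
    unsigned __int128 p = (unsigned __int128)d * y[j] + z[j] + h;
    z[j] = (uint64_t)p;
    h = (uint64_t)(p >> 64);
  }
  unsigned __int128 top = (unsigned __int128)*c0 + h;
  *c0 = (uint64_t)top;
  *c1 = (uint64_t)(top >> 64);
}

One such row per digit of x, each followed by the Montgomery reduction pass sketched next to the bignum_montsqr copy of this structure further down, reproduces the outer loop.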
+ + ldr d, [x, i, lsl #3] + mov j, xzr + adds h, xzr, xzr +bignum_montmul_maddloop: + ldr a, [y, j, lsl #3] + ldr e, [z, j, lsl #3] + mul l, d, a + adcs e, e, h + umulh h, d, a + adc h, h, xzr + adds e, e, l + str e, [z, j, lsl #3] + add j, j, #1 + sub t, j, k + cbnz t, bignum_montmul_maddloop + adcs c0, c0, h + adc c1, xzr, xzr + +// Montgomery reduction loop, similar but offsetting writebacks + + ldr e, [z] + mul d, e, w + ldr a, [m] + mul l, d, a + umulh h, d, a + adds e, e, l // Will be zero but want the carry + mov j, #1 + sub t, k, #1 + cbz t, bignum_montmul_montend +bignum_montmul_montloop: + ldr a, [m, j, lsl #3] + ldr e, [z, j, lsl #3] + mul l, d, a + adcs e, e, h + umulh h, d, a + adc h, h, xzr + adds e, e, l + sub l, j, #1 + str e, [z, l, lsl #3] + add j, j, #1 + sub t, j, k + cbnz t, bignum_montmul_montloop +bignum_montmul_montend: + adcs h, c0, h + adc c0, c1, xzr + sub l, j, #1 + str h, [z, l, lsl #3] + +// End of outer loop + + add i, i, #1 + cmp i, k + bcc bignum_montmul_outerloop + +// Now do a comparison of (c0::z) with (0::m) to set a final correction mask +// indicating that (c0::z) >= m and so we need to subtract m. + + subs j, xzr, xzr +bignum_montmul_cmploop: + ldr a, [z, j, lsl #3] + ldr e, [m, j, lsl #3] + sbcs xzr, a, e + add j, j, #1 + sub t, j, k + cbnz t, bignum_montmul_cmploop + + sbcs xzr, c0, xzr + csetm c0, cs + +// Now do a masked subtraction of m for the final reduced result. + + subs j, xzr, xzr +bignum_montmul_corrloop: + ldr a, [z, j, lsl #3] + ldr e, [m, j, lsl #3] + and e, e, c0 + sbcs a, a, e + str a, [z, j, lsl #3] + add j, j, #1 + sub t, j, k + cbnz t, bignum_montmul_corrloop + +bignum_montmul_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_montredc.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_montredc.S new file mode 100644 index 00000000000..5b19e7a6acd --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_montredc.S @@ -0,0 +1,194 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery reduce, z := (x' / 2^{64p}) MOD m +// Inputs x[n], m[k], p; output z[k] +// +// extern void bignum_montredc +// (uint64_t k, uint64_t *z, +// uint64_t n, uint64_t *x, uint64_t *m, uint64_t p); +// +// Does a := (x' / 2^{64p}) mod m where x' = x if n <= p + k and in general +// is the lowest (p+k) digits of x, assuming x' <= 2^{64p} * m. That is, +// p-fold Montgomery reduction w.r.t. a k-digit modulus m giving a k-digit +// answer. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = n, X3 = x, X4 = m, X5 = p +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montredc) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montredc) + .text + .balign 4 + +#define k x0 +#define z x1 +#define n x2 +#define x x3 +#define m x4 +#define p x5 + +// Negated modular inverse +#define w x6 +// Outer loop counter +#define i x7 +// Inner loop counter +#define j x8 +// Home for Montgomery multiplier +#define d x9 +// Top carry for current window +#define c x14 + +#define h x10 +#define e x11 +#define l x12 +#define a x13 + +// Some more intuitive names for temp regs in initial word-level negmodinv. 
+// These just use i and j again, which aren't used early on. + +#define one x7 +#define e1 x7 +#define e2 x8 +#define e4 x7 +#define e8 x8 + + +S2N_BN_SYMBOL(bignum_montredc): + +// If k = 0 the whole operation is trivial + + cbz k, bignum_montredc_end + +// Compute word-level negated modular inverse w for m == m[0]. +// This is essentially the same as word_negmodinv. + + ldr a, [m] + lsl w, a, #2 + sub w, a, w + eor w, w, #2 + mov one, #1 + madd e1, a, w, one + mul e2, e1, e1 + madd w, e1, w, w + mul e4, e2, e2 + madd w, e2, w, w + mul e8, e4, e4 + madd w, e4, w, w + madd w, e8, w, w + +// Initialize z to the lowest k digits of the input, zero-padding if n < k. + + cmp n, k + csel j, k, n, cs + mov i, xzr + cbz j, bignum_montredc_padloop +bignum_montredc_copyloop: + ldr a, [x, i, lsl #3] + str a, [z, i, lsl #3] + add i, i, #1 + cmp i, j + bcc bignum_montredc_copyloop + + cmp i, k + bcs bignum_montredc_initialized + +bignum_montredc_padloop: + str xzr, [z, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_montredc_padloop + +bignum_montredc_initialized: + mov c, xzr + +// Now if p = 0 we just need the corrective tail, and even that is +// only needed for the case when the input is exactly the modulus, +// to maintain the <= 2^64p * n precondition + + cbz p, bignum_montredc_corrective + +// Outer loop, just doing a standard Montgomery reduction on z + + mov i, xzr +bignum_montredc_outerloop: + + ldr e, [z] + mul d, e, w + ldr a, [m] + mul l, d, a + umulh h, d, a + adds e, e, l // Will be zero but want the carry + mov j, #1 + sub a, k, #1 + cbz a, bignum_montredc_montend +bignum_montredc_montloop: + ldr a, [m, j, lsl #3] + ldr e, [z, j, lsl #3] + mul l, d, a + adcs e, e, h + umulh h, d, a + adc h, h, xzr + adds e, e, l + sub l, j, #1 + str e, [z, l, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_montredc_montloop +bignum_montredc_montend: + adcs h, h, c + adc c, xzr, xzr + add j, j, i + cmp j, n + bcs bignum_montredc_offtheend + ldr a, [x, j, lsl #3] + adds h, h, a + adc c, c, xzr +bignum_montredc_offtheend: + sub j, k, #1 + str h, [z, j, lsl #3] + +// End of outer loop + + add i, i, #1 + cmp i, p + bcc bignum_montredc_outerloop + +// Now do a comparison of (c::z) with (0::m) to set a final correction mask +// indicating that (c::z) >= m and so we need to subtract m. + +bignum_montredc_corrective: + + subs j, xzr, xzr +bignum_montredc_cmploop: + ldr a, [z, j, lsl #3] + ldr e, [m, j, lsl #3] + sbcs xzr, a, e + add j, j, #1 + sub a, j, k + cbnz a, bignum_montredc_cmploop + + sbcs xzr, c, xzr + csetm c, cs + +// Now do a masked subtraction of m for the final reduced result. + + subs j, xzr, xzr +bignum_montredc_corrloop: + ldr a, [z, j, lsl #3] + ldr e, [m, j, lsl #3] + and e, e, c + sbcs a, a, e + str a, [z, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_montredc_corrloop + +bignum_montredc_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_montsqr.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_montsqr.S new file mode 100644 index 00000000000..a7824964a83 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_montsqr.S @@ -0,0 +1,192 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^{64k}) mod m +// Inputs x[k], m[k]; output z[k] +// +// extern void bignum_montsqr +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); +// +// Does z := (x^2 / 2^{64k}) mod m, assuming x^2 <= 2^{64k} * m, which is +// guaranteed in particular if x < m initially (the "intended" case). +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = x, X3 = m +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr) + .text + .balign 4 + +#define k x0 +#define z x1 +#define x x2 +#define m x3 + +// Negated modular inverse +#define w x4 +// Top carry for k'th position +#define c0 x5 +// Additional top carry for (k+1)'th position +#define c1 x6 +// Outer loop counter +#define i x7 +// Home for i'th digit or Montgomery multiplier +#define d x8 +// Inner loop counter +#define j x9 + +#define h x10 +#define e x11 +#define l x12 +#define a x13 + +// This is just a short-term temporary used in zero-test subtraction. +// It's aliased to the same register as "a" which is always safe here. + +#define t x13 + +// Some more intuitive names for temp regs in initial word-level negmodinv. +// These just use c0 and c1 again, which aren't initialized early on. + +#define one x5 +#define e1 x5 +#define e2 x6 +#define e4 x5 +#define e8 x6 + + +S2N_BN_SYMBOL(bignum_montsqr): + +// If k = 0 the whole operation is trivial + + cbz k, bignum_montsqr_end + +// Compute word-level negated modular inverse w for m == m[0]. +// This is essentially the same as word_negmodinv. + + ldr a, [m] + lsl w, a, #2 + sub w, a, w + eor w, w, #2 + mov one, #1 + madd e1, a, w, one + mul e2, e1, e1 + madd w, e1, w, w + mul e4, e2, e2 + madd w, e2, w, w + mul e8, e4, e4 + madd w, e4, w, w + madd w, e8, w, w + +// Initialize the output c0::z to zero so we can then consistently add rows. +// It would be a bit more efficient to special-case the zeroth row, but +// this keeps the code slightly simpler. + + mov i, xzr +bignum_montsqr_zoop: + str xzr, [z, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_montsqr_zoop + mov c0, xzr + +// Outer loop pulling down digits d=x[i], multiplying by x and reducing + + mov i, xzr +bignum_montsqr_outerloop: + +// Multiply-add loop where we always have CF + previous high part h to add in +// Note that in general we do need yet one more carry in this phase and hence +// initialize c1 with the top carry. 
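The code that follows is one row of the classic word-serial Montgomery pattern (bignum_montmul above has the same shape, with a separate y operand): add x[i] times the multiplicand into the accumulator, then perform one reduction step with d = z[0] * w, chosen so the low word cancels, and let the offset writebacks shift everything down a word. Purely for orientation, a C sketch of the whole routine is given below; it is not part of the patch, the function name is invented, it assumes a compiler providing unsigned __int128 (as GCC and Clang do), and it assumes z does not alias the inputs. For the squaring case, read it with y == x.

#include <stdint.h>

/* z := (x*y / 2^{64k}) mod m, where w = -1/m[0] mod 2^64 and
   x*y <= 2^{64k} * m (which holds in particular when x, y < m). */
static void montmul_sketch(uint64_t k, uint64_t *z, const uint64_t *x,
                           const uint64_t *y, const uint64_t *m, uint64_t w) {
  uint64_t c0 = 0, c1 = 0;                      /* carries above z[k-1] */
  for (uint64_t i = 0; i < k; i++) z[i] = 0;
  for (uint64_t i = 0; i < k; i++) {
    /* Multiply-add row: (c1:c0:z) += x[i] * y */
    unsigned __int128 t = 0;
    for (uint64_t j = 0; j < k; j++) {
      t += (unsigned __int128)x[i] * y[j] + z[j];
      z[j] = (uint64_t)t;
      t >>= 64;
    }
    t += c0;
    c0 = (uint64_t)t;
    c1 = (uint64_t)(t >> 64);
    /* Montgomery step: d makes z[0] + d*m[0] divisible by 2^64, and the
       whole accumulator is then divided by 2^64 via the offset writebacks. */
    uint64_t d = z[0] * w;
    t = ((unsigned __int128)d * m[0] + z[0]) >> 64;
    for (uint64_t j = 1; j < k; j++) {
      t += (unsigned __int128)d * m[j] + z[j];
      z[j - 1] = (uint64_t)t;
      t >>= 64;
    }
    t += c0;
    z[k - 1] = (uint64_t)t;
    c0 = c1 + (uint64_t)(t >> 64);
  }
  /* Final correction: subtract m once if (c0::z) >= m.  A branch is used
     here for clarity; the assembly keeps this part branch-free. */
  uint64_t borrow = 0;
  for (uint64_t j = 0; j < k; j++) {
    unsigned __int128 s = (unsigned __int128)z[j] - m[j] - borrow;
    borrow = (uint64_t)(s >> 64) & 1;
  }
  uint64_t mask = (c0 != 0 || borrow == 0) ? ~(uint64_t)0 : (uint64_t)0;
  borrow = 0;
  for (uint64_t j = 0; j < k; j++) {
    unsigned __int128 s = (unsigned __int128)z[j] - (m[j] & mask) - borrow;
    z[j] = (uint64_t)s;
    borrow = (uint64_t)(s >> 64) & 1;
  }
}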
+ + ldr d, [x, i, lsl #3] + mov j, xzr + adds h, xzr, xzr +bignum_montsqr_maddloop: + ldr a, [x, j, lsl #3] + ldr e, [z, j, lsl #3] + mul l, d, a + adcs e, e, h + umulh h, d, a + adc h, h, xzr + adds e, e, l + str e, [z, j, lsl #3] + add j, j, #1 + sub t, j, k + cbnz t, bignum_montsqr_maddloop + adcs c0, c0, h + adc c1, xzr, xzr + +// Montgomery reduction loop, similar but offsetting writebacks + + ldr e, [z] + mul d, e, w + ldr a, [m] + mul l, d, a + umulh h, d, a + adds e, e, l // Will be zero but want the carry + mov j, #1 + sub t, k, #1 + cbz t, bignum_montsqr_montend +bignum_montsqr_montloop: + ldr a, [m, j, lsl #3] + ldr e, [z, j, lsl #3] + mul l, d, a + adcs e, e, h + umulh h, d, a + adc h, h, xzr + adds e, e, l + sub l, j, #1 + str e, [z, l, lsl #3] + add j, j, #1 + sub t, j, k + cbnz t, bignum_montsqr_montloop +bignum_montsqr_montend: + adcs h, c0, h + adc c0, c1, xzr + sub l, j, #1 + str h, [z, l, lsl #3] + +// End of outer loop + + add i, i, #1 + cmp i, k + bcc bignum_montsqr_outerloop + +// Now do a comparison of (c0::z) with (0::m) to set a final correction mask +// indicating that (c0::z) >= m and so we need to subtract m. + + subs j, xzr, xzr +bignum_montsqr_cmploop: + ldr a, [z, j, lsl #3] + ldr e, [m, j, lsl #3] + sbcs xzr, a, e + add j, j, #1 + sub t, j, k + cbnz t, bignum_montsqr_cmploop + + sbcs xzr, c0, xzr + csetm c0, cs + +// Now do a masked subtraction of m for the final reduced result. + + subs j, xzr, xzr +bignum_montsqr_corrloop: + ldr a, [z, j, lsl #3] + ldr e, [m, j, lsl #3] + and e, e, c0 + sbcs a, a, e + str a, [z, j, lsl #3] + add j, j, #1 + sub t, j, k + cbnz t, bignum_montsqr_corrloop + +bignum_montsqr_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/arm/generic/bignum_mul.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_mul.S similarity index 100% rename from third_party/s2n-bignum/arm/generic/bignum_mul.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_mul.S diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_muladd10.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_muladd10.S new file mode 100644 index 00000000000..a1deef001a5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_muladd10.S @@ -0,0 +1,62 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply bignum by 10 and add word: z := 10 * z + d +// Inputs z[k], d; outputs function return (carry) and z[k] +// +// extern uint64_t bignum_muladd10 (uint64_t k, uint64_t *z, uint64_t d); +// +// Although typically the input d < 10, this is not actually required. 
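Functionally, each step replaces z[i] by the low word of 10 * z[i] plus the incoming carry and passes the high word on, with d acting as the initial carry. A C sketch of that contract is shown below; it is not part of the patch, the helper name is invented, and it assumes a compiler providing unsigned __int128 (GCC/Clang). The assembly below avoids the wide multiply by forming the two-word product 10*a as 2*a + 8*a using shifts.

#include <stdint.h>

/* z := 10*z + d over k little-endian 64-bit digits; returns the carry out. */
static uint64_t bignum_muladd10_sketch(uint64_t k, uint64_t *z, uint64_t d) {
  for (uint64_t i = 0; i < k; i++) {
    unsigned __int128 t = (unsigned __int128)z[i] * 10 + d;
    z[i] = (uint64_t)t;          /* low word stays in place */
    d = (uint64_t)(t >> 64);     /* high word is the carry into the next digit */
  }
  return d;                      /* for k = 0 this is just the input d */
}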
+// +// Standard ARM ABI: X0 = k, X1 = z, X2 = d, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_muladd10) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_muladd10) + .text + .balign 4 + +#define k x0 +#define z x1 +#define d x2 + +#define i x3 +#define h x4 + +#define a x5 +#define l x5 + +S2N_BN_SYMBOL(bignum_muladd10): + +// If k = 0 just return the input d as the carry (out of zero digits) + + cbz k, bignum_muladd10_end + +// Simple loop + + mov i, xzr +bignum_muladd10_loop: + ldr a, [z, i, lsl #3] + lsr h, a, #61 + add l, a, a + add h, h, h, lsr #2 + adds l, l, l, lsl #2 + adc h, h, xzr + adds a, l, d + str a, [z, i, lsl #3] + adc d, h, xzr + add i, i, 1 + cmp i, k + bcc bignum_muladd10_loop + +// Return the final carry + +bignum_muladd10_end: + mov x0, d + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_mux.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_mux.S new file mode 100644 index 00000000000..1b45f9fa200 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_mux.S @@ -0,0 +1,50 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiplex/select z := x (if p nonzero) or z := y (if p zero) +// Inputs p, x[k], y[k]; output z[k] +// +// extern void bignum_mux +// (uint64_t p, uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y); +// +// It is assumed that all numbers x, y and z have the same size k digits. +// +// Standard ARM ABI: X0 = p, X1 = k, X2 = z, X3 = x, X4 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mux) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mux) + .text + .balign 4 + +#define b x0 +#define k x1 +#define z x2 +#define x x3 +#define y x4 +#define a x5 + + +S2N_BN_SYMBOL(bignum_mux): + +cbz k, bignum_mux_end // if k = 0 skip the bignum_mux_loop + cmp b, #0 // Set condition codes b = 0 + +// We've set cc's from b once and for all and can now re-use "b" as a temporary + +bignum_mux_loop: + sub k, k, #1 + ldr a, [x, k, lsl #3] + ldr b, [y, k, lsl #3] + csel a, a, b, ne + str a, [z, k, lsl #3] + cbnz k, bignum_mux_loop + +bignum_mux_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_mux16.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_mux16.S new file mode 100644 index 00000000000..c3999111346 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_mux16.S @@ -0,0 +1,67 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Select element from 16-element table, z := xs[k*i] +// Inputs xs[16*k], i; output z[k] +// +// extern void bignum_mux16 +// (uint64_t k, uint64_t *z, uint64_t *xs, uint64_t i); +// +// It is assumed that all numbers xs[16] and the target z have the same size k +// The pointer xs is to a contiguous array of size 16, elements size-k bignums +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = xs, X3 = i +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mux16) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mux16) + .text + .balign 4 + +#define k x0 +#define z x1 +#define x x2 +#define i x3 + +#define a x4 +#define b x5 +#define j x6 +#define n x7 + + +S2N_BN_SYMBOL(bignum_mux16): + +// Copy size into decrementable counter, skip everything if k = 0 + + adds n, k, xzr + beq bignum_mux16_end + +// Multiply i by k so we can compare pointer offsets directly with it + + mul i, i, k + +bignum_mux16_loop: + + ldr a, [x] + mov j, k + .rep 15 + ldr b, [x, j, lsl #3] + cmp j, i + csel a, b, a, eq + add j, j, k + .endr + str a, [z] + + add z, z, #8 + add x, x, #8 + subs n, n, #1 + bne bignum_mux16_loop + +bignum_mux16_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_negmodinv.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_negmodinv.S new file mode 100644 index 00000000000..4772a3512db --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_negmodinv.S @@ -0,0 +1,135 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Negated modular inverse, z := (-1/x) mod 2^{64k} +// Input x[k]; output z[k] +// +// extern void bignum_negmodinv +// (uint64_t k, uint64_t *z, uint64_t *x); +// +// Assuming x is odd (otherwise nothing makes sense) the result satisfies +// +// x * z + 1 == 0 (mod 2^{64 * k}) +// +// but is not necessarily reduced mod x. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_negmodinv) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_negmodinv) + .text + .balign 4 + +#define k x0 +#define z x1 +#define x x2 + +#define w x3 +#define a x4 +#define m x5 +#define h x6 +#define l x7 +#define e x8 +#define i x9 + + + +S2N_BN_SYMBOL(bignum_negmodinv): + +// If k = 0 do nothing + + cbz k, bignum_negmodinv_end + +// Compute word-level negated modular inverse w for x[0]. + + ldr a, [x] + lsl w, a, #2 + sub w, a, w + eor w, w, #2 + mov h, #1 + madd h, a, w, h + mul l, h, h + madd w, h, w, w + mul h, l, l + madd w, l, w, w + mul l, h, h + madd w, h, w, w + madd w, l, w, w + +// Write that as lowest word of the output, then if k = 1 we're finished + + str w, [z] + cmp k, #1 + beq bignum_negmodinv_end + +// Otherwise compute and write the other digits (1..k-1) of w * x + 1. +// Note that at this point CF was set by the comparison (subtraction) "k - 1". +// Since k >= 2 if we got here, this subtraction didn't carry; allowing +// for the inverted carry on ARM that means that CF is guaranteed to be set. 
+// This allows us to ignore the nominal "a * w + 1" from adding the low +// part of the product, since its only contribution is to set the carry +// flag. Thus, we only calculate the high part of a * w explicitly. + + umulh h, a, w + mov i, #1 +bignum_negmodinv_initloop: + ldr a, [x, i, lsl #3] + mul l, a, w + adcs l, l, h + umulh h, a, w + str l, [z, i, lsl #3] + add i, i, #1 + sub a, k, i + cbnz a, bignum_negmodinv_initloop + +// For simpler indexing, z := z + 8 and k := k - 1 per outer iteration +// Then we can use the same index for x and for z and effective size k. +// +// But we also offset k by 1 so the "real" size is k + 1, which is why the +// test at the end of the inner loop is i < k <=> i' = i + 1 < k + 1. +// This lets us avoid some special cases inside the loop at the cost +// of needing the additional "finale" tail for the final iteration +// since we do one outer loop iteration too few. + + subs k, k, #2 + beq bignum_negmodinv_finale + +bignum_negmodinv_outerloop: + add z, z, #8 + ldr e, [z] + mul m, e, w + str m, [z] + ldr a, [x] + umulh h, a, m + subs xzr, e, #1 // Effective carry from a * m + e + mov i, #1 +bignum_negmodinv_innerloop: + ldr a, [x, i, lsl #3] + ldr e, [z, i, lsl #3] + mul l, a, m + adcs e, e, h + umulh h, a, m + adc h, h, xzr + adds e, e, l + str e, [z, i, lsl #3] + sub a, i, k + add i, i, #1 + cbnz a, bignum_negmodinv_innerloop + + subs k, k, #1 + bne bignum_negmodinv_outerloop + +bignum_negmodinv_finale: + ldr e, [z, #8] + mul m, e, w + str m, [z, #8] + +bignum_negmodinv_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_nonzero.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_nonzero.S new file mode 100644 index 00000000000..072018af3ef --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_nonzero.S @@ -0,0 +1,44 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Test bignum for nonzero-ness x =/= 0 +// Input x[k]; output function return +// +// extern uint64_t bignum_nonzero (uint64_t k, uint64_t *x); +// +// Standard ARM ABI: X0 = k, X1 = x, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_nonzero) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_nonzero) + .text + .balign 4 + +#define k x0 +#define x x1 +#define a x2 +#define c x3 + + +S2N_BN_SYMBOL(bignum_nonzero): + +mov c, xzr // c will be or of the digits + cbz k, bignum_nonzero_end // if k = 0 skip the bignum_nonzero_loop + +bignum_nonzero_loop: + sub k, k, #1 + ldr a, [x, k, lsl #3] + orr c, c, a + cbnz k, bignum_nonzero_loop + + cmp c, xzr + cset x0, ne + +bignum_nonzero_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_normalize.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_normalize.S new file mode 100644 index 00000000000..403bb1935fb --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_normalize.S @@ -0,0 +1,108 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Normalize bignum in-place by shifting left till top bit is 1 +// Input z[k]; outputs function return (bits shifted left) and z[k] +// +// extern uint64_t bignum_normalize (uint64_t k, uint64_t *z); +// +// Given a k-digit bignum z, this function shifts it left by its number of +// leading zero bits, to give result with top bit 1, unless the input number +// was 0. The return is the same as the output of bignum_clz, i.e. the number +// of bits shifted (nominally 64 * k in the case of zero input). +// +// Standard ARM ABI: X0 = k, X1 = z, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_normalize) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_normalize) + .text + .balign 4 + +#define k x0 +#define z x1 + +// This is the return value we accumulate + +#define r x2 + +// Other variables + +#define a x3 +#define b x4 +#define c x5 +#define d x6 +#define i x7 +#define j x8 +#define l x9 + + +S2N_BN_SYMBOL(bignum_normalize): + +// If k = 0 the whole operation is trivial. Otherwise initialize +// shift count r and top digit c, but then if k = 1 skip the digitwise part + + subs i, k, #1 + bcc bignum_normalize_end + ldr c, [z, i, lsl #3] + mov r, xzr + beq bignum_normalize_bitpart + +// Do a rather stupid but constant-time digit normalization, conditionally +// shifting left (k-1) times based on whether the top word is zero. +// With careful binary striding this could be O(k*log(k)) instead of O(k^2) +// while still retaining the constant-time style. + +bignum_normalize_normloop: + mov j, xzr + cmp c, xzr + cinc r, r, eq + mov a, xzr +bignum_normalize_shufloop: + mov c, a + ldr a, [z, j, lsl #3] + csel c, c, a, eq + str c, [z, j, lsl #3] + add j, j, #1 + sub d, j, k + cbnz d, bignum_normalize_shufloop + subs i, i, #1 + bne bignum_normalize_normloop + +// We now have the top digit nonzero, assuming the input was nonzero, +// and as per the invariant of the loop above, c holds that digit. So +// now just count c's leading zeros and shift z bitwise that many bits. + +bignum_normalize_bitpart: + lsl r, r, #6 + clz c, c + add r, r, c + + mov b, xzr + mov i, xzr + ands xzr, c, #63 + csetm l, ne + neg d, c +bignum_normalize_bitloop: + ldr j, [z, i, lsl #3] + lsl a, j, c + orr a, a, b + lsr b, j, d + and b, b, l + str a, [z, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_normalize_bitloop + +// Return the final shift count + + mov x0, r + +bignum_normalize_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_odd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_odd.S new file mode 100644 index 00000000000..54d24fd6a74 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_odd.S @@ -0,0 +1,30 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Test bignum for odd-ness +// Input x[k]; output function return +// +// extern uint64_t bignum_odd (uint64_t k, uint64_t *x); +// +// Standard ARM ABI: X0 = k, X1 = x, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_odd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_odd) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_odd): + +cbz x0, bignum_odd_end // if k = 0, that's the return! + ldr x0, [x1] + and x0, x0, #1 + +bignum_odd_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_of_word.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_of_word.S new file mode 100644 index 00000000000..b355ce79eb7 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_of_word.S @@ -0,0 +1,45 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert single digit to bignum, z := n +// Input n; output z[k] +// +// extern void bignum_of_word (uint64_t k, uint64_t *z, uint64_t n); +// +// Create a k-digit (digit=64 bits) bignum at z with value n (mod 2^k) +// where n is a word. The "mod 2^k" only matters in the degenerate k = 0 case. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = n +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_of_word) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_of_word) + .text + .balign 4 + +#define k x0 +#define z x1 +#define n x2 + + +S2N_BN_SYMBOL(bignum_of_word): + +cbz k, bignum_of_word_end // if k = 0 do nothing + + str n, [z] // Set zeroth word to n + subs k, k, #1 // k := k - 1 + beq bignum_of_word_end // and if that's 0, finish + +bignum_of_word_loop: + str xzr, [z, k, lsl #3] + subs k, k, #1 + bne bignum_of_word_loop + +bignum_of_word_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_optadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_optadd.S new file mode 100644 index 00000000000..f4821128f87 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_optadd.S @@ -0,0 +1,71 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally add, z := x + y (if p nonzero) or z := x (if p zero) +// Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_optadd +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y); +// +// It is assumed that all numbers x, y and z have the same size k digits. +// Returns carry-out as per usual addition, always 0 if p was zero. 
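The constant-time trick here is the usual masking one: p is converted into an all-zeros or all-ones word, which is ANDed onto every word of y, so exactly the same add-with-carry sequence runs whether or not the addition is wanted. A C sketch of the same contract (not part of the patch; the helper name is invented) is:

#include <stdint.h>

/* z := x + (p ? y : 0) over k digits; returns the carry out (0 or 1). */
static uint64_t bignum_optadd_sketch(uint64_t k, uint64_t *z, const uint64_t *x,
                                     uint64_t p, const uint64_t *y) {
  uint64_t mask = (uint64_t)0 - (uint64_t)(p != 0);   /* csetm-style mask */
  uint64_t carry = 0;
  for (uint64_t i = 0; i < k; i++) {
    uint64_t b = y[i] & mask;           /* y[i] or 0, with no branch */
    uint64_t s = x[i] + b;
    uint64_t c = (uint64_t)(s < x[i]);  /* carry from the first addition */
    z[i] = s + carry;
    carry = c + (uint64_t)(z[i] < s);   /* total carry is still 0 or 1 */
  }
  return carry;
}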
+// +// Standard ARM ABI: X0 = k, X1 = z, X2 = x, X3 = p, X4 = y, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optadd) + .text + .balign 4 + +#define k x0 +#define z x1 +#define x x2 +#define p x3 +#define y x4 + +#define m x3 +#define a x5 +#define b x6 +#define i x7 + + +S2N_BN_SYMBOL(bignum_optadd): + +// if k = 0 do nothing. This is also the right top carry in X0 + + cbz k, bignum_optadd_end + +// Convert p into a strict bitmask (same register in fact) + + cmp p, xzr + csetm m, ne + +// Set i = 0 *and* make sure initial CF = 0 + + adds i, xzr, xzr + +// Main loop + +bignum_optadd_loop: + ldr a, [x, i] + ldr b, [y, i] + and b, b, m + adcs a, a, b + str a, [z, i] + add i, i, #8 + sub k, k, #1 + cbnz k, bignum_optadd_loop + +// Return carry flag + + adc x0, xzr, xzr + +bignum_optadd_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_optneg.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_optneg.S new file mode 100644 index 00000000000..e4507f67d6c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_optneg.S @@ -0,0 +1,70 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally negate, z := -x (if p nonzero) or z := x (if p zero) +// Inputs p, x[k]; outputs function return (nonzero input) and z[k] +// +// extern uint64_t bignum_optneg +// (uint64_t k, uint64_t *z, uint64_t p, uint64_t *x); +// +// It is assumed that both numbers x and z have the same size k digits. +// Returns a carry, which is equivalent to "x is nonzero". +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = p, X3 = x, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optneg) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optneg) + .text + .balign 4 + +#define k x0 +#define z x1 +#define p x2 +#define x x3 + +#define a x4 +#define i x5 + + +S2N_BN_SYMBOL(bignum_optneg): + +// if k = 0 do nothing. 
This also has the right top carry zero in x0 + + cbz k, bignum_optneg_end + +// Convert p into a strict bitmask + + cmp p, xzr + csetm p, ne + +// Generate an initial carry-in for the negating case only to add 1; this +// is because we are actually going to do complements of the words of x + + adds xzr, p, p + +// Main loop + mov i, xzr +bignum_optneg_loop: + ldr a, [x, i] + eor a, a, p + adcs a, a, xzr + str a, [z, i] + add i, i, #8 + sub k, k, #1 + cbnz k, bignum_optneg_loop + +// Return carry flag, fixing up inversion for negative case + + adc x0, xzr, xzr + neg p, p + eor x0, x0, p + +bignum_optneg_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/arm/generic/bignum_optsub.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_optsub.S similarity index 100% rename from third_party/s2n-bignum/arm/generic/bignum_optsub.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_optsub.S diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_optsubadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_optsubadd.S new file mode 100644 index 00000000000..0dcc10a39a5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_optsubadd.S @@ -0,0 +1,86 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally subtract or add, z := x + sgn(p) * y interpreting p as signed +// Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_optsubadd +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y); +// +// If p has top bit set (i.e. is negative as a signed int) return z := x - y +// Else if p is nonzero (i.e. is positive as a signed int) return z := x + y +// Otherwise (i.e. p is zero) return z := x +// +// Return in X0 = the top carry, which will be 0 or 1, and appropriate for +// addition or subtraction respectively (and always zero for p = 0) +// +// 2^{64*k} * -carryout + z = x - y [for subtraction] +// 2^{64*k} * carryout + z = x + y [for addition] +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = x, X3 = p, X4 = y, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optsubadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optsubadd) + .text + .balign 4 + +#define k x0 +#define z x1 +#define x x2 +#define p x3 +#define y x4 + +#define m x3 +#define q x5 +#define a x6 +#define b x7 +#define i x8 + + +S2N_BN_SYMBOL(bignum_optsubadd): + +// if k = 0 do nothing. 
This is also the right top carry in X0 + + cbz k, bignum_optsubadd_end + +// Turn the input p into two bitmasks, m indicating to use the y input at +// all (same register as p) and q indicating a sign-flip + + cmp p, xzr + csetm m, ne + csetm q, mi + +// Generate an initial carry-in for the negating case only to add 1; this +// is because we are actually going to do complements of the words of y + + adds xzr, q, q + +// Main loop + + mov i, xzr +bignum_optsubadd_loop: + ldr b, [y, i] + eor b, b, q + ldr a, [x, i] + and b, b, m + adcs a, a, b + str a, [z, i] + add i, i, #8 + sub k, k, #1 + cbnz k, bignum_optsubadd_loop + +// Return carry flag, fixing up inversion for negative case + + adc x0, xzr, xzr + neg q, q + eor x0, x0, q + +bignum_optsubadd_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_pow2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_pow2.S new file mode 100644 index 00000000000..4f647a55ebd --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_pow2.S @@ -0,0 +1,60 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Return bignum of power of 2, z := 2^n +// Input n; output z[k] +// +// extern void bignum_pow2 (uint64_t k, uint64_t *z, uint64_t n); +// +// The result is as usual mod 2^{64*k}, so will be zero if n >= 64*k. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = n +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_pow2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_pow2) + .text + .balign 4 + +#define k x0 +#define z x1 +#define n x2 + +#define w x3 +#define i x4 +#define a x5 + + +S2N_BN_SYMBOL(bignum_pow2): + +// If k = 0 the result is trivially zero + + cbz k, bignum_pow2_end + +// Create the index n at which to write the nonzero word and the word w itself +// Note that the ARM manual explicitly says that shift counts are taken modulo +// the datasize, so we don't need to mask the lower 6 bits of n ourselves. + + mov w, #1 + lsl w, w, n + lsr n, n, #6 + +// Now in a constant-time fashion set the n'th word to w and others to zero + + mov i, xzr +bignum_pow2_loop: + cmp i, n + csel a, w, xzr, eq + str a, [z, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_pow2_loop + +bignum_pow2_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_shl_small.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_shl_small.S new file mode 100644 index 00000000000..77cf097b006 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_shl_small.S @@ -0,0 +1,99 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Shift bignum left by c < 64 bits z := x * 2^c +// Inputs x[n], c; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_shl_small +// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t c); +// +// Does the "z := x << c" operation where x is n digits, result z is p. 
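A plain-C picture of that contract may help (a sketch only, not part of the patch; the helper name is invented): each output word is the shifted input word OR'd with the bits carried out of the previous one, output words beyond the input receive the carry word and then zeros, and whatever carry is left over is returned as the next word of the result.

#include <stdint.h>

/* z := x << (c mod 64), truncated to p digits; returns the next word up. */
static uint64_t bignum_shl_small_sketch(uint64_t p, uint64_t *z, uint64_t n,
                                        const uint64_t *x, uint64_t c) {
  c &= 63;
  if (n > p) n = p;                     /* never read past the p'th input word */
  uint64_t carry = 0;
  for (uint64_t i = 0; i < n; i++) {
    uint64_t t = x[i];
    z[i] = (t << c) | carry;
    carry = c ? (t >> (64 - c)) : 0;    /* avoid the undefined 64-bit shift */
  }
  for (uint64_t i = n; i < p; i++) {    /* carry word, then zero padding */
    z[i] = carry;
    carry = 0;
  }
  return carry;
}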
+// The shift count c is masked to 6 bits so it actually uses c' = c mod 64. +// The return value is the "next word" of a p+1 bit result, if n <= p. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = n, X3 = x, X4 = c, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_shl_small) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_shl_small) + .text + .balign 4 + +#define p x0 +#define z x1 +#define n x2 +#define x x3 +#define c x4 + +#define d x5 +#define a x6 +#define b x7 +#define m x8 +#define t x9 +#define i x10 + + +S2N_BN_SYMBOL(bignum_shl_small): + +// First clamp the input size n := min(p,n) since we can never need to read +// past the p'th term of the input to generate p-digit output. + + cmp n, p + csel n, p, n, cs + +// Initialize counter i and "previous word" carry b to zero +// and skip main loop if n = 0 + + mov b, xzr + mov i, xzr + cbz n, bignum_shl_small_tail + +// Set up a mask for nonzero shift and a negated version of the shift. +// Note that all basic word-level shifts are predictably masked to 6 bits. + + ands xzr, c, #63 + csetm m, ne + neg d, c + +// Now the main loop +bignum_shl_small_loop: + ldr t, [x, i, lsl #3] + lsl a, t, c + orr a, a, b + lsr b, t, d + and b, b, m + str a, [z, i, lsl #3] + add i, i, #1 + cmp i, n + bcc bignum_shl_small_loop + +// If we are at the end, finish, otherwise write carry word then zeros + +bignum_shl_small_tail: + + cmp i, p + bcs bignum_shl_small_end + str b, [z, i, lsl #3] + mov b, xzr + add i, i, #1 + cmp i, p + bcs bignum_shl_small_end + +bignum_shl_small_tloop: + str xzr, [z, i, lsl #3] + add i, i, #1 + cmp i, p + bcc bignum_shl_small_tloop + +// Return top word + +bignum_shl_small_end: + + mov x0, b + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_shr_small.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_shr_small.S new file mode 100644 index 00000000000..8ddcad9a83b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_shr_small.S @@ -0,0 +1,90 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Shift bignum right by c < 64 bits z := floor(x / 2^c) +// Inputs x[n], c; outputs function return (bits shifted out) and z[k] +// +// extern uint64_t bignum_shr_small +// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t c); +// +// Does the "z := x >> c" operation where x is n digits, result z is p. +// The shift count c is masked to 6 bits so it actually uses c' = c mod 64. +// The return value is the inout mod 2^c'. 
+// +// Standard ARM ABI: X0 = k, X1 = z, X2 = n, X3 = x, X4 = c, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_shr_small) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_shr_small) + .text + .balign 4 + +#define p x0 +#define z x1 +#define n x2 +#define x x3 +#define c x4 + +#define d x5 +#define a x6 +#define b x7 +#define m x8 +#define t x9 + + +S2N_BN_SYMBOL(bignum_shr_small): + +// Set default carry-in word to 0 + + mov b, xzr + +// First, if p > n then pad output on the left with p-n zeros + + cmp n, p + bcs bignum_shr_small_nopad +bignum_shr_small_padloop: + sub p, p, #1 + str xzr, [z, p, lsl #3] + cmp n, p + bcc bignum_shr_small_padloop + +// We now know that p <= n. If in fact p < n let carry word = x[p] instead of 0 + +bignum_shr_small_nopad: + beq bignum_shr_small_shiftstart + ldr b, [x, p, lsl #3] +bignum_shr_small_shiftstart: + +// Set up negated version of the shift and shift b in preparation. +// Use a mask for nonzero shift to fake 64-bit left shift in zero case + + neg d, c + lsl b, b, d + ands xzr, c, #63 + csetm m, ne + and b, b, m + +// Now the main loop + + cbz p, bignum_shr_small_end +bignum_shr_small_loop: + sub p, p, #1 + ldr t, [x, p, lsl #3] + lsr a, t, c + orr a, a, b + lsl b, t, d + and b, b, m + str a, [z, p, lsl #3] + cbnz p, bignum_shr_small_loop + +// Return top word, shifted back to be a modulus + +bignum_shr_small_end: + lsr x0, b, d + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/arm/generic/bignum_sqr.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_sqr.S similarity index 100% rename from third_party/s2n-bignum/arm/generic/bignum_sqr.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_sqr.S diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_sub.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_sub.S new file mode 100644 index 00000000000..5e9e40c9550 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_sub.S @@ -0,0 +1,118 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Subtract, z := x - y +// Inputs x[m], y[n]; outputs function return (carry-out) and z[p] +// +// extern uint64_t bignum_sub +// (uint64_t p, uint64_t *z, +// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Does the z := x - y operation, truncating modulo p words in general and +// returning a top borrow (0 or 1) in the p'th place, only subtracting input +// words below p (as well as m and n respectively) to get the diff and borrow. +// +// Standard ARM ABI: X0 = p, X1 = z, X2 = m, X3 = x, X4 = n, X5 = y, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub) + .text + .balign 4 + +#define p x0 +#define z x1 +#define m x2 +#define x x3 +#define n x4 +#define y x5 +#define i x6 +#define a x7 +#define d x8 + + +S2N_BN_SYMBOL(bignum_sub): + +// First clamp the two input sizes m := min(p,m) and n := min(p,n) since +// we'll never need words past the p'th. Can now assume m <= p and n <= p. 
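Taken together, the size clamping just described and the branch-and-loop code that follows implement a simple contract, sketched in C here for orientation (not part of the patch; the helper name is invented and unsigned __int128 is assumed): words beyond each operand's length behave as zeros, nothing past the p'th word of either input is read, and the return value is the borrow out of the p'th place.

#include <stdint.h>

/* z := x - y truncated to p digits; returns the borrow (0 or 1) at place p. */
static uint64_t bignum_sub_sketch(uint64_t p, uint64_t *z,
                                  uint64_t m, const uint64_t *x,
                                  uint64_t n, const uint64_t *y) {
  if (m > p) m = p;                     /* clamp both input sizes to p */
  if (n > p) n = p;
  uint64_t borrow = 0;
  for (uint64_t i = 0; i < p; i++) {
    uint64_t a = (i < m) ? x[i] : 0;    /* shorter operands act as zero */
    uint64_t b = (i < n) ? y[i] : 0;
    unsigned __int128 s = (unsigned __int128)a - b - borrow;
    z[i] = (uint64_t)s;
    borrow = (uint64_t)(s >> 64) & 1;   /* 1 exactly when a < b + borrow */
  }
  return borrow;
}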
+// Then compare the modified m and n and branch accordingly + + cmp m, p + csel m, p, m, cs + cmp n, p + csel n, p, n, cs + cmp m, n + bcc bignum_sub_ylonger + +// The case where x is longer or of the same size (p >= m >= n) + + sub p, p, m + sub m, m, n + subs i, xzr, xzr + cbz n, bignum_sub_xmainskip +bignum_sub_xmainloop: + ldr a, [x, i, lsl #3] + ldr d, [y, i, lsl #3] + sbcs a, a, d + str a, [z, i, lsl #3] + add i, i, #1 + sub n, n, #1 + cbnz n, bignum_sub_xmainloop +bignum_sub_xmainskip: + cbz m, bignum_sub_xtopskip +bignum_sub_xtoploop: + ldr a, [x, i, lsl #3] + sbcs a, a, xzr + str a, [z, i, lsl #3] + add i, i, #1 + sub m, m, #1 + cbnz m, bignum_sub_xtoploop +bignum_sub_xtopskip: + cbnz p, bignum_sub_tails + cset x0, cc + ret + +// The case where y is longer (p >= n > m) + +bignum_sub_ylonger: + sub p, p, n + sub n, n, m + subs i, xzr, xzr + cbz m, bignum_sub_ytoploop +bignum_sub_ymainloop: + ldr a, [x, i, lsl #3] + ldr d, [y, i, lsl #3] + sbcs a, a, d + str a, [z, i, lsl #3] + add i, i, #1 + sub m, m, #1 + cbnz m, bignum_sub_ymainloop +bignum_sub_ytoploop: + ldr a, [y, i, lsl #3] + sbcs a, xzr, a + str a, [z, i, lsl #3] + add i, i, #1 + sub n, n, #1 + cbnz n, bignum_sub_ytoploop +bignum_sub_ytopskip: + cbnz p, bignum_sub_tails + cset x0, cc + ret + +// Adding a non-trivial tail, when p > max(m,n) + +bignum_sub_tails: + csetm a, cc +bignum_sub_tailloop: + str a, [z, i, lsl #3] + add i, i, #1 + subs p, p, #1 + bne bignum_sub_tailloop + neg x0, a + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_bytereverse.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_bytereverse.S new file mode 100644 index 00000000000..bb892b7b0bf --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_bytereverse.S @@ -0,0 +1,39 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reverse the order of bytes in a 64-bit word +// +// extern uint64_t word_bytereverse (uint64_t a); +// +// Standard ARM ABI: X0 = a, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_bytereverse) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_bytereverse) + .text + .balign 4 + +S2N_BN_SYMBOL(word_bytereverse): + + mov x1, #0xFFFF0000FFFF0000 + mov x2, #0x0000FFFF0000FFFF + and x1, x1, x0 + and x2, x2, x0 + ror x1, x1, #32 + orr x0, x1, x2 + + mov x1, #0xFF00FF00FF00FF00 + mov x2, #0x00FF00FF00FF00FF + and x1, x1, x0 + and x2, x2, x0 + ror x1, x1, #24 + ror x2, x2, #8 + orr x0, x1, x2 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_clz.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_clz.S new file mode 100644 index 00000000000..f77eb03412f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_clz.S @@ -0,0 +1,25 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count leading zero bits in a single word +// Input a; output function return +// +// extern uint64_t word_clz (uint64_t a); +// +// Standard ARM ABI: X0 = a, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_clz) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_clz) + .text + .balign 4 + +S2N_BN_SYMBOL(word_clz): + clz x0, x0 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_ctz.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_ctz.S new file mode 100644 index 00000000000..2f7bcade862 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_ctz.S @@ -0,0 +1,37 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count trailing zero bits in a single word +// Input a; output function return +// +// extern uint64_t word_ctz (uint64_t a); +// +// Standard ARM ABI: X0 = a, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_ctz) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_ctz) + .text + .balign 4 + +S2N_BN_SYMBOL(word_ctz): + +// ARM doesn't have a direct word ctz instruction, so we emulate it via +// ctz(w) = 64 - clz(~w & (w-1)). This is depending, for cases of the form +// ctz(....1), on the behavior clz(0) = 64, which is guaranteed according +// to the ARM manual. + + mvn x1, x0 + sub x0, x0, #1 + and x0, x0, x1 + clz x1, x0 + mov x0, #64 + sub x0, x0, x1 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_divstep59.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_divstep59.S new file mode 100644 index 00000000000..a94c70cf150 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_divstep59.S @@ -0,0 +1,323 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Perform 59 "divstep" iterations and return signed matrix of updates +// Inputs d, f, g; output m[2][2] and function return (updated d) +// +// extern int64_t word_divstep59 +// (int64_t m[2][2],int64_t d,uint64_t f,uint64_t g); +// +// Standard ARM ABI: X0 = m, X1 = d, X2 = f, X3 = g, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_divstep59) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_divstep59) + .text + .balign 4 + +#define m x0 +#define d x1 +#define f x2 +#define g x3 + +#define fuv x4 +#define grs x5 +#define t x6 +#define n x7 + +#define m00 x8 +#define m01 x9 +#define m10 x10 +#define m11 x11 + +#define n00 x12 +#define n01 x13 +#define n10 x14 +#define n11 x15 + +S2N_BN_SYMBOL(word_divstep59): + +// Pack f and g into single registers with (negated) update matrices, +// initially the identity matrix. 
The f_lo and g_lo are initially +// the 20 lowest bits of f and g. +// +// fuv = f_lo - 2^41 * 1 - 2^62 * 0 +// grs = g_lo - 2^41 * 0 - 2^62 * 1 + + and fuv, f, #0xFFFFF + orr fuv, fuv, 0xFFFFFE0000000000 + + and grs, g, #0xFFFFF + orr grs, grs, 0xc000000000000000 + + tst grs, #1 + +// Now do 20 divsteps on that packed format. +// +// At the i'th iteration (starting at i = 0, ending at i = 20) +// the intermediate packed values are of the form +// +// fuv = f_lo - 2^{41-i} * m00 - 2^{62-i} * m01 +// grs = g_lo - 2^{41-i} * m10 - 2^{62-i} * m11 +// +// where the following matrix indicates the updates to apply +// to the original (full-sized) f and g for those iterations. +// +// [m00 m01] * [f_0] = [f_i] +// [m10 m11] [g_0] [g_i] + +.set i, 0 +.rep 20 + + csel t, fuv, xzr, ne + ccmp d, xzr, #8, ne + + cneg d, d, ge + cneg t, t, ge + csel fuv, grs, fuv, ge + + add grs, grs, t + add d, d, #2 + +.if (i< 19) + tst grs, #2 +.endif + asr grs, grs, #1 + +.set i, (i+1) +.endr + +// Extract the matrix entries, but keep them in negated form. + + add m00, fuv, #1048576 + sbfx m00, m00, #21, #21 + + mov m11, #1048576 + add m11, m11, m11, lsl #21 + add m01, fuv, m11 + asr m01, m01, #42 + + add m10, grs, #1048576 + sbfx m10, m10, #21, #21 + + add m11, grs, m11 + asr m11, m11, #42 + +// Compute updated f and g using the negated matrix entries; +// this flips the signs of f and g but it doesn't matter. +// +// f = (m00 * f + m01 * g) / 2^20 +// g = (m10 * f + m11 * g) / 2^20 +// +// Since we only need another 40 bits, we can do all of that +// computation naively using (implicitly signed) 64-bit words. + + mul t, m00, f + mul n, m01, g + mul f, m10, f + mul g, m11, g + + add fuv, t, n + add grs, f, g + + asr f, fuv, #20 + asr g, grs, #20 + +// Re-pack for 20 more rounds + + and fuv, f, #0xFFFFF + orr fuv, fuv, 0xFFFFFE0000000000 + + and grs, g, #0xFFFFF + orr grs, grs, 0xc000000000000000 + + tst grs, #1 + +// Second block of 20 divsteps in the same style + +.set i, 0 +.rep 20 + + csel t, fuv, xzr, ne + ccmp d, xzr, #8, ne + + cneg d, d, ge + cneg t, t, ge + csel fuv, grs, fuv, ge + + add grs, grs, t + add d, d, #2 + +.if (i< 19) + tst grs, #2 +.endif + asr grs, grs, #1 + +.set i, (i+1) +.endr + +// Extract the next matrix entries, in negated form again + + add n00, fuv, #1048576 + sbfx n00, n00, #21, #21 + + mov n11, #1048576 + add n11, n11, n11, lsl #21 + add n01, fuv, n11 + asr n01, n01, #42 + + add n10, grs, #1048576 + sbfx n10, n10, #21, #21 + + add n11, grs, n11 + asr n11, n11, #42 + +// Compute updated f and g using the negated matrix entries, +// and so again flipping (thus actually restoring) the signs. +// +// f = (n00 * f + n01 * g) / 2^20 +// g = (n10 * f + n11 * g) / 2^20 + + mul t, n00, f + mul n, n01, g + mul f, n10, f + mul g, n11, g + + add fuv, t, n + add grs, f, g + + asr f, fuv, #20 + asr g, grs, #20 + +// Re-pack for 19 more rounds + + and fuv, f, #0xFFFFF + orr fuv, fuv, 0xFFFFFE0000000000 + + and grs, g, #0xFFFFF + orr grs, grs, 0xc000000000000000 + + tst grs, #1 + +// Split the last divsteps into two blocks of 10 and 9 to insert the matrix +// multiplication in between them. The first ten iterations: + +.set i, 0 +.rep 10 + + csel t, fuv, xzr, ne + ccmp d, xzr, #8, ne + + cneg d, d, ge + cneg t, t, ge + csel fuv, grs, fuv, ge + + add grs, grs, t + add d, d, #2 + + tst grs, #2 + asr grs, grs, #1 + +.set i, (i+1) +.endr + +// Multiply the first two matrices. 
+// +// [m00 m01] = [n00 n01] * [m00 m01] +// [m10 m11] [n10 n11] [m10 m11] +// +// The resulting matrix entries are: +// +// m00' = n00 * m00 + n01 * m10 +// m01' = n00 * m01 + n01 * m11 +// m10' = n10 * m00 + n11 * m10 +// m11' = n10 * m01 + n11 * m11 + + mul f, n00, m00 + mul g, n00, m01 + mul t, n10, m00 + mul n, n10, m01 + + madd m00, n01, m10, f + madd m01, n01, m11, g + madd m10, n11, m10, t + madd m11, n11, m11, n + +// Now the final 9 divsteps + +.rep 9 + + csel t, fuv, xzr, ne + ccmp d, xzr, #8, ne + + cneg d, d, ge + cneg t, t, ge + csel fuv, grs, fuv, ge + + add grs, grs, t + add d, d, #2 + +.if (i< 18) + tst grs, #2 +.endif + asr grs, grs, #1 + +.set i, (i+1) +.endr + +// Extract the matrix entries from the final 19 divsteps + + add n00, fuv, #1048576 + sbfx n00, n00, #22, #21 + + mov n11, #1048576 + add n11, n11, n11, lsl #21 + add n01, fuv, n11 + asr n01, n01, #43 + + add n10, grs, #1048576 + sbfx n10, n10, #22, #21 + + add n11, grs, n11 + asr n11, n11, #43 + +// Multiply by this new matrix +// +// [m00 m01] = [n00 n01] * [m00 m01] +// [m10 m11] [n10 n11] [m10 m11] +// +// The resulting matrix entries are: +// +// m00' = n00 * m00 + n01 * m10 +// m01' = n00 * m01 + n01 * m11 +// m10' = n10 * m00 + n11 * m10 +// m11' = n10 * m01 + n11 * m11 +// +// Since we didn't negate the n matrix, all products are negated +// here using "mneg" and "msub" in place of "mul" and "madd", so +// we have the correct sign for the returned composite matrix. + + mneg f, n00, m00 + mneg g, n00, m01 + mneg fuv, n10, m00 + mneg grs, n10, m01 + + msub m00, n01, m10, f + msub m01, n01, m11, g + msub m10, n11, m10, fuv + msub m11, n11, m11, grs + +// Finally store back and return final d. + + stp m00, m01, [m] + stp m10, m11, [m, #16] + + mov x0, d + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_max.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_max.S new file mode 100644 index 00000000000..2f6a2ea5cac --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_max.S @@ -0,0 +1,30 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Return maximum of two unsigned 64-bit words +// Inputs a, b; output function return +// +// extern uint64_t word_max (uint64_t a, uint64_t b); +// +// Standard ARM ABI: X0 = a, X1 = b, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_max) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_max) + .text + .balign 4 + +#define a x0 +#define b x1 + +S2N_BN_SYMBOL(word_max): + + cmp a, b + csel x0, a, b, cs + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_min.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_min.S new file mode 100644 index 00000000000..774538a6b79 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_min.S @@ -0,0 +1,30 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Return minimum of two unsigned 64-bit words +// Inputs a, b; output function return +// +// extern uint64_t word_min (uint64_t a, uint64_t b); +// +// Standard ARM ABI: X0 = a, X1 = b, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_min) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_min) + .text + .balign 4 + +#define a x0 +#define b x1 + +S2N_BN_SYMBOL(word_min): + + cmp a, b + csel x0, a, b, cc + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_negmodinv.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_negmodinv.S new file mode 100644 index 00000000000..f44b3e885cb --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_negmodinv.S @@ -0,0 +1,78 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Single-word negated modular inverse (-1/a) mod 2^64 +// Input a; output function return +// +// extern uint64_t word_negmodinv (uint64_t a); +// +// A 64-bit function that returns a negated multiplicative inverse mod 2^64 +// of its input, assuming that input is odd. Given odd input a, the result z +// will satisfy a * z + 1 == 0 (mod 2^64), i.e. a 64-bit word multiplication +// a * z will give -1. +// +// Standard ARM ABI: X0 = a, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_negmodinv) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_negmodinv) + .text + .balign 4 + +// Use some more intuitive variable names but these in general are aliased +// to each other so need care when interpreting. Overall we only use the +// registers x0, x1 and x2. +// +// There does seem a slight efficiency advantage in putting e' = e^2 +// before the x' = x (1 + e) each time. That's the only reason for not +// reversing those and hence being able to alias all the e values to the +// same register. + +#define a x0 +#define x x1 +#define one x2 + +#define e1 x2 +#define e2 x0 +#define e4 x2 +#define e8 x0 + +S2N_BN_SYMBOL(word_negmodinv): + +// Initial magical 5-bit approximation x = (a - a<<2) xor 2 + + lsl x, a, #2 + sub x, a, x + eor x, x, #2 + +// Get error e = a * x + 1 for subsequent correction steps + + mov one, #1 + madd e1, a, x, one + +// e2 = e^2, x' = x (1 + e) is good to 10 bits + + mul e2, e1, e1 + madd x, e1, x, x + +// e4 = e^4, x' = x (1 + e^2) is good to 20 bits + + mul e4, e2, e2 + madd x, e2, x, x + +// e8 = e^8, x' = x (1 + e^4) is good to 40 bits + + mul e8, e4, e4 + madd x, e4, x, x + +// Final x' = x (1 + e^8) is good to the 64-bit word size + + madd x0, e8, x, x + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_popcount.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_popcount.S new file mode 100644 index 00000000000..04d5a3b2957 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_popcount.S @@ -0,0 +1,41 @@ +// Copyright Amazon.com, Inc. 
or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count number of set bits in a single 64-bit word (population count) +// Input a; output function return +// +// extern uint64_t word_popcount (uint64_t a); +// +// Standard ARM ABI: X0 = a, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_popcount) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_popcount) + .text + .balign 4 + +// Very similar to the traditional algorithm, e.g. Hacker's Delight 5-2 + +S2N_BN_SYMBOL(word_popcount): + + and x1, x0, #0xAAAAAAAAAAAAAAAA + sub x0, x0, x1, lsr #1 + + bic x1, x0, #0x3333333333333333 + and x0, x0, #0x3333333333333333 + add x0, x0, x1, lsr #2 + + add x0, x0, x0, lsr #4 + and x0, x0, #0x0F0F0F0F0F0F0F0F + + mov x1, #0x101010101010101 + mul x0, x0, x1 + lsr x0, x0, #56 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_recip.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_recip.S new file mode 100644 index 00000000000..f4fc72056d4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_recip.S @@ -0,0 +1,119 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Single-word reciprocal, underestimate of floor(2^128 / a) - 2^64 +// Input a; output function return +// +// extern uint64_t word_recip (uint64_t a); +// +// Given an input word "a" with its top bit set (i.e. 2^63 <= a < 2^64), the +// result "x" is implicitly augmented with a leading 1 giving x' = 2^64 + x. +// The result is x' = ceil(2^128 / a) - 1, which except for the single +// special case a = 2^63 is the same thing as x' = floor(2^128 / a). +// +// Standard ARM ABI: X0 = a, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_recip) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_recip) + .text + .balign 4 + +#define a x0 +#define x x1 + +// Some of these are aliased for clarity + +#define b x2 + +#define t x3 +#define l x3 + +#define d x4 +#define h x4 + +S2N_BN_SYMBOL(word_recip): + +// Scale the input down: b overestimates a/2^16 with b <= 2^48 and +// x underestimates 2^64/b with b * x =~= 2^64, accurate to ~2 bits. + + lsr b, a, #16 + eor x, b, #0x1FFFFFFFFFFFF + add b, b, #1 + lsr x, x, #32 + +// Suppose x = 2^64/b * (1 - e). and get scaled error d = 2^64 * e + + msub d, b, x, xzr + +// Rescale to give c = 2^15 * e (so c <= 2^13) and compute +// e + e^2 + e^3 + e^4 = (1 + e^2) (e + e^2) +// = (2^30 + c^2) * (2^15 * c + c^2) / 2^60 +// and then x * (1 + e + e^2 + e^3 + e^4) +// = (2^30 * x + x * (2^30 + c^2) * (2^30 * c + c^2) / 2^30) / 2^30 + + lsr t, d, #49 + mul t, t, t + lsr d, d, #34 + add d, t, d + orr t, t, #0x40000000 + mul t, d, t + lsr t, t, #30 + lsl d, x, #30 + madd x, x, t, d + lsr x, x, #30 + +// Now b * x =~= 2^64, accurate to ~10 bits. +// Do a 64-bit Newton step, scaling up x by 16 bits in the process. + + msub d, b, x, xzr + lsr d, d, #24 + mul d, d, x + lsl x, x, #16 + lsr d, d, #24 + add x, x, d + +// Now b * x =~= 2^80, accurate to ~20 bits. 
+// Do a 64-bit Newton step, scaling up x by 31 bits in the process + + msub d, b, x, xzr + lsr d, d, #32 + mul d, d, x + lsl x, x, #31 + lsr d, d, #17 + add x, x, d + +// Now a * x =~= 2^127, accurate to ~40 bits. Do a Newton step at full size. +// Instead of literally negating the product (h,l) we complement bits in +// the extracted bitfield, which is close enough and a bit faster. +// At the end we also shift x one more bit left, losing the known-1 top bit +// so that a * (2^64 + x) =~= 2^128. + + mul l, a, x + umulh h, a, x + extr l, h, l, #60 + lsr h, x, #33 + mvn l, l + mul l, h, l + lsl x, x, #1 + lsr l, l, #33 + add x, x, l + +// Test if (x' + 1) * a < 2^128 where x' = 2^64 + x, catching the special +// case where x + 1 would wrap, corresponding to input a = 2^63. + + adds t, x, #1 + cinv t, t, eq + umulh h, a, t + adds h, h, a + +// Select either x or x + 1 accordingly as the final answer + + csel x0, x, t, cs + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/Makefile b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/Makefile new file mode 100644 index 00000000000..4489fbc1665 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/Makefile @@ -0,0 +1,66 @@ +############################################################################# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 +############################################################################# + +# If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise +# use a cross-assembling version so that the code can still be assembled +# and the proofs checked against the object files (though you won't be able +# to run code without additional emulation infrastructure). 
The aarch64 +# cross-assembling version can be installed manually by something like: +# +# sudo apt-get install binutils-aarch64-linux-gnu + +UNAME_RESULT=$(shell uname -p) + +ifeq ($(UNAME_RESULT),aarch64) +GAS=as +else +GAS=aarch64-linux-gnu-as +endif + +# List of object files + +OBJ = bignum_add_p256.o \ + bignum_bigendian_4.o \ + bignum_cmul_p256.o \ + bignum_deamont_p256.o \ + bignum_demont_p256.o \ + bignum_double_p256.o \ + bignum_half_p256.o \ + bignum_inv_p256.o \ + bignum_littleendian_4.o \ + bignum_mod_n256.o \ + bignum_mod_n256_4.o \ + bignum_mod_p256.o \ + bignum_mod_p256_4.o \ + bignum_montinv_p256.o \ + bignum_montmul_p256.o \ + bignum_montmul_p256_alt.o \ + bignum_montsqr_p256.o \ + bignum_montsqr_p256_alt.o \ + bignum_mux_4.o \ + bignum_neg_p256.o \ + bignum_nonzero_4.o \ + bignum_optneg_p256.o \ + bignum_sub_p256.o \ + bignum_tomont_p256.o \ + bignum_triple_p256.o \ + p256_montjadd.o \ + p256_montjadd_alt.o \ + p256_montjdouble.o \ + p256_montjdouble_alt.o \ + p256_montjmixadd.o \ + p256_montjmixadd_alt.o \ + p256_montjscalarmul.o \ + p256_montjscalarmul_alt.o \ + p256_scalarmul.o \ + p256_scalarmul_alt.o \ + p256_scalarmulbase.o \ + p256_scalarmulbase_alt.o + +%.o : %.S ; $(CC) -E -I../../include $< | $(GAS) -o $@ - + +default: $(OBJ); + +clean:; rm -f *.o *.correct unopt/*.o diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_add_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_add_p256.S new file mode 100644 index 00000000000..e739659cebd --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_add_p256.S @@ -0,0 +1,73 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Add modulo p_256, z := (x + y) mod p_256, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_add_p256 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add_p256) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 +#define c x3 +#define d0 x4 +#define d1 x5 +#define d2 x6 +#define d3 x7 +#define n0 x8 +#define n1 x9 +#define n2 x10 +#define n3 x11 + +S2N_BN_SYMBOL(bignum_add_p256): + +// First just add the numbers as [c;d3;d2;d1;d0] + + ldp d0, d1, [x] + ldp n0, n1, [y] + adds d0, d0, n0 + adcs d1, d1, n1 + ldp d2, d3, [x, #16] + ldp n2, n3, [y, #16] + adcs d2, d2, n2 + adcs d3, d3, n3 + adc c, xzr, xzr + +// Now let [c;n3;n2;n1;n0] = [c;d3;d2;d1;d0] - p_256 + + subs n0, d0, #0xffffffffffffffff + mov n1, #0x00000000ffffffff + sbcs n1, d1, n1 + sbcs n2, d2, xzr + mov n3, #0xffffffff00000001 + sbcs n3, d3, n3 + sbcs c, c, xzr + +// Select result according to whether (x + y) - p_256 < 0 + + csel d0, d0, n0, cc + csel d1, d1, n1, cc + csel d2, d2, n2, cc + csel d3, d3, n3, cc + +// Store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_bigendian_4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_bigendian_4.S new file mode 100644 index 00000000000..c19c799048b --- /dev/null +++ 
b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_bigendian_4.S @@ -0,0 +1,136 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert 4-digit (256-bit) bignum to/from big-endian form +// Input x[4]; output z[4] +// +// extern void bignum_bigendian_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// The same function is given two other prototypes whose names reflect the +// treatment of one or other argument as a byte array rather than word array: +// +// extern void bignum_frombebytes_4 +// (uint64_t z[static 4], uint8_t x[static 32]); +// +// extern void bignum_tobebytes_4 +// (uint8_t z[static 32], uint64_t x[static 4]); +// +// The implementation works by loading in bytes and storing in words (i.e. +// stylistically it is "frombebytes"); in the more common little-endian +// usage of ARM, this is just byte reversal. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_bigendian_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_bigendian_4) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_frombebytes_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_frombebytes_4) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tobebytes_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tobebytes_4) + + .text + .balign 4 + +#define z x0 +#define x x1 + +#define d x2 +#define dshort w2 +#define a x3 +#define c x4 + +// The reads and writes are organized in mirror-image pairs (0-3 and 1-2) +// to allow x and z to point to the same buffer without using more +// intermediate registers. 
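As an editorial aside, the byte/word scheme described above is easy to state in C. The sketch below uses an assumed name and is only a functional reference for the three aliased entry points: each 64-bit limb is assembled from eight big-endian bytes, and limbs are stored least-significant first.

#include <stdint.h>

// Sketch: z[3] (most significant limb) comes from bytes 0..7 of the
// big-endian input, z[0] from bytes 24..31.
static void bignum_frombebytes_4_ref(uint64_t z[4], const uint8_t x[32]) {
  for (int i = 0; i < 4; i++) {
    uint64_t w = 0;
    for (int j = 0; j < 8; j++)
      w = (w << 8) | x[8 * (3 - i) + j];
    z[i] = w;
  }
}

On a little-endian target this amounts to reversing the 32 bytes, which is why the one routine can serve as bignum_bigendian_4, bignum_frombebytes_4 and bignum_tobebytes_4.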
+ +S2N_BN_SYMBOL(bignum_bigendian_4): +S2N_BN_SYMBOL(bignum_frombebytes_4): +S2N_BN_SYMBOL(bignum_tobebytes_4): + +// 0 and 3 words + + ldrb dshort, [x, #7] + extr a, d, xzr, #8 + ldrb dshort, [x, #6] + extr a, d, a, #8 + ldrb dshort, [x, #5] + extr a, d, a, #8 + ldrb dshort, [x, #4] + extr a, d, a, #8 + ldrb dshort, [x, #3] + extr a, d, a, #8 + ldrb dshort, [x, #2] + extr a, d, a, #8 + ldrb dshort, [x, #1] + extr a, d, a, #8 + ldrb dshort, [x] + extr a, d, a, #8 + + ldrb dshort, [x, #31] + extr c, d, xzr, #8 + ldrb dshort, [x, #30] + extr c, d, c, #8 + ldrb dshort, [x, #29] + extr c, d, c, #8 + ldrb dshort, [x, #28] + extr c, d, c, #8 + ldrb dshort, [x, #27] + extr c, d, c, #8 + ldrb dshort, [x, #26] + extr c, d, c, #8 + ldrb dshort, [x, #25] + extr c, d, c, #8 + ldrb dshort, [x, #24] + extr c, d, c, #8 + + str a, [z, #24] + str c, [z] + +// 1 and 2 words + + ldrb dshort, [x, #15] + extr a, d, xzr, #8 + ldrb dshort, [x, #14] + extr a, d, a, #8 + ldrb dshort, [x, #13] + extr a, d, a, #8 + ldrb dshort, [x, #12] + extr a, d, a, #8 + ldrb dshort, [x, #11] + extr a, d, a, #8 + ldrb dshort, [x, #10] + extr a, d, a, #8 + ldrb dshort, [x, #9] + extr a, d, a, #8 + ldrb dshort, [x, #8] + extr a, d, a, #8 + + ldrb dshort, [x, #23] + extr c, d, xzr, #8 + ldrb dshort, [x, #22] + extr c, d, c, #8 + ldrb dshort, [x, #21] + extr c, d, c, #8 + ldrb dshort, [x, #20] + extr c, d, c, #8 + ldrb dshort, [x, #19] + extr c, d, c, #8 + ldrb dshort, [x, #18] + extr c, d, c, #8 + ldrb dshort, [x, #17] + extr c, d, c, #8 + ldrb dshort, [x, #16] + extr c, d, c, #8 + + str a, [z, #16] + str c, [z, #8] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_cmul_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_cmul_p256.S new file mode 100644 index 00000000000..e0c231f7bf7 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_cmul_p256.S @@ -0,0 +1,131 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word modulo p_256, z := (c * x) mod p_256, assuming +// x reduced +// Inputs c, x[4]; output z[4] +// +// extern void bignum_cmul_p256 +// (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = c, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p256) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p256_alt) + .text + .balign 4 + +#define z x0 +#define m x1 +#define x x2 + +#define d0 x3 +#define d1 x4 +#define d2 x5 +#define d3 x6 +#define h x7 +#define c x8 +#define a0 x9 +#define a1 x10 +#define a2 x11 + +// These are the same! 
+#define a3 x12 +#define q x12 + + +S2N_BN_SYMBOL(bignum_cmul_p256): + +S2N_BN_SYMBOL(bignum_cmul_p256_alt): + +// First do the multiply, straightforwardly to [d;d3;d2;d1;d0] + + ldp a0, a1, [x] + ldp a2, a3, [x, #16] + mul d0, m, a0 + mul d1, m, a1 + mul d2, m, a2 + mul d3, m, a3 + umulh a0, m, a0 + umulh a1, m, a1 + umulh a2, m, a2 + umulh h, m, a3 + adds d1, d1, a0 + adcs d2, d2, a1 + adcs d3, d3, a2 + adcs h, h, xzr + +// Writing the product as z = 2^256 * h + 2^192 * l + t = 2^192 * hl + t, our +// intended quotient approximation is (hl + hl>>32 + 1)>>64. Note that by +// hypothesis our product is <= (2^64 - 1) * (p_256 - 1), so there is no need +// to max this out to avoid wrapping, unlike in the more general case of +// bignum_mod_p256. + + subs xzr, xzr, xzr // set carry flag for +1 + extr q, h, d3, #32 + adcs xzr, d3, q + lsr q, h, #32 + adcs q, h, q + +// It's easy to see -p_256 <= z - q * p_256 < p_256, so we just need to +// subtract q * p_256 and then correct if that is negative by adding p_256. +// We want z - q * p_256 +// = (z + 2^224 * q) - (2^256 + 2^192 + 2^96 - 1) * q +// +// We just do that computation in 5 words, freely ignoring the carry, +// since we have plenty to make our later decision just based on one bit, +// so one extra word is ample. + +// First do [a2;a1] = 2^32 * q, which we use twice + + lsl a1, q, #32 + lsr a2, q, #32 + +// Add that to hl, hence including the 2^224 * q part + + adds d3, d3, a1 + adc h, h, a2 + +// Now accumulate [a2;a1;a0] = (2^96 - 1) * q. +// Remember q might be zero so we truly need a (short) carry chain here. + + subs a0, xzr, q + sbcs a1, a1, xzr + sbc a2, a2, xzr + +// Hence load remaining digits and do the subtraction + + subs d0, d0, a0 + sbcs d1, d1, a1 + sbcs d2, d2, a2 + sbcs d3, d3, q + sbcs c, h, q + +// Now our top word is either zero or all 1s, and we use this to discriminate +// whether a correction is needed because our result is negative +// So correct by adding masked p_256 + + adds d0, d0, c + mov h, #0x00000000ffffffff + and h, h, c + adcs d1, d1, h + adcs d2, d2, xzr + mov h, #0xffffffff00000001 + and h, h, c + adc d3, d3, h + +// Finally store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_deamont_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_deamont_p256.S new file mode 100644 index 00000000000..783be66f845 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_deamont_p256.S @@ -0,0 +1,115 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from almost-Montgomery form, z := (x / 2^256) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_deamont_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Convert a 4-digit bignum x out of its (optionally almost) Montgomery form, +// "almost" meaning any 4-digit input will work, with no range restriction. 
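The quotient-estimate reduction documented in bignum_cmul_p256 above can be followed more easily in plain C. The sketch below is an editorial reference with assumed names (cmul_p256_ref, P256), using unsigned __int128 for double-word arithmetic; unlike the csel-based assembly it is not constant-time. It performs the 4x1 multiply into five limbs, forms q = (hl + (hl >> 32) + 1) >> 64 from the top two limbs, subtracts q * p_256, and adds p_256 back if the result went negative.

#include <stdint.h>

typedef unsigned __int128 u128;

static const uint64_t P256[4] = {    // p_256, least significant limb first
  0xffffffffffffffffULL, 0x00000000ffffffffULL,
  0x0000000000000000ULL, 0xffffffff00000001ULL
};

static void cmul_p256_ref(uint64_t z[4], uint64_t c, const uint64_t x[4]) {
  uint64_t d[5], qp[5], r[5];
  u128 acc = 0;

  for (int i = 0; i < 4; i++) {                 // d = c * x (five limbs)
    acc += (u128)c * x[i];
    d[i] = (uint64_t)acc;
    acc >>= 64;
  }
  d[4] = (uint64_t)acc;

  // Quotient estimate from the top two limbs; for x < p_256 this sum stays
  // below 2^128, as the comments above argue.
  u128 hl = ((u128)d[4] << 64) | d[3];
  uint64_t q = (uint64_t)((hl + (hl >> 32) + 1) >> 64);

  acc = 0;                                      // qp = q * p_256 (five limbs)
  for (int i = 0; i < 4; i++) {
    acc += (u128)q * P256[i];
    qp[i] = (uint64_t)acc;
    acc >>= 64;
  }
  qp[4] = (uint64_t)acc;

  uint64_t b = 0;                               // r = d - qp, keeping the sign limb
  for (int i = 0; i < 5; i++) {
    uint64_t bi = ((u128)d[i] < (u128)qp[i] + b);
    r[i] = d[i] - qp[i] - b;
    b = bi;
  }

  uint64_t m = r[4];                            // 0 if r >= 0, all ones if r < 0
  u128 carry = 0;
  for (int i = 0; i < 4; i++) {                 // add back p_256 when negative
    u128 t = (u128)r[i] + (P256[i] & m) + carry;
    z[i] = (uint64_t)t;
    carry = t >> 64;
  }
}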
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_deamont_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_deamont_p256) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_deamont_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_deamont_p256_alt) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1] and generating d4 from zero, re-using +// d0 as a temporary internally together with t0, t1 and t2. +// It is fine for d4 to be the same register as d0, and it often is. +// --------------------------------------------------------------------------- + +#define montreds(d4,d3,d2,d1,d0, t2,t1,t0) \ +/* Let w = d0, the original word we use as offset; d0 gets recycled */ \ +/* First let [t2;t1] = 2^32 * w */ \ +/* then let [d0;t0] = (2^64 - 2^32 + 1) * w (overwrite old d0) */ \ + lsl t1, d0, #32 __LF \ + subs t0, d0, t1 __LF \ + lsr t2, d0, #32 __LF \ + sbc d0, d0, t2 __LF \ +/* Hence [d4;..;d1] := [d3;d2;d1;0] + (2^256 - 2^224 + 2^192 + 2^96) * w */ \ + adds d1, d1, t1 __LF \ + adcs d2, d2, t2 __LF \ + adcs d3, d3, t0 __LF \ + adc d4, d0, xzr + +// Input parameters + +#define z x0 +#define x x1 + +// Rotating registers for the intermediate windows (with repetitions) + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 + +// Other temporaries + +#define u x6 +#define v x7 +#define w x8 + +S2N_BN_SYMBOL(bignum_deamont_p256): + +S2N_BN_SYMBOL(bignum_deamont_p256_alt): + +// Set up an initial window with the input x and an extra leading zero + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Systematically scroll left doing 1-step reductions + + montreds(d0,d3,d2,d1,d0, u,v,w) + + montreds(d1,d0,d3,d2,d1, u,v,w) + + montreds(d2,d1,d0,d3,d2, u,v,w) + + montreds(d3,d2,d1,d0,d3, u,v,w) + +// Now compare end result in [d3;d2;d1;d0] with p_256 = [w; 0; v; -1] + + mov v, #0x00000000ffffffff + mov w, #0xffffffff00000001 + + subs xzr, d0, #-1 + sbcs xzr, d1, v + sbcs xzr, d2, xzr + sbcs xzr, d3, w + +// Convert the condition [d3;d2;d1;d0] >= p_256 into a bitmask +// and do a masked subtraction + + csetm u, cs + + subs d0, d0, u + and v, v, u + sbcs d1, d1, v + sbcs d2, d2, xzr + and w, w, u + sbc d3, d3, w + +// Write back result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_demont_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_demont_p256.S new file mode 100644 index 00000000000..36ea7ec2a9a --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_demont_p256.S @@ -0,0 +1,93 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from Montgomery form z := (x / 2^256) mod p_256, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_demont_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// This assumes the input is < p_256 for correctness. If this is not the case, +// use the variant "bignum_deamont_p256" instead. 
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_demont_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_demont_p256) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_demont_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_demont_p256_alt) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1] and generating d4 from zero, re-using +// d0 as a temporary internally together with t0, t1 and t2. +// It is fine for d4 to be the same register as d0, and it often is. +// --------------------------------------------------------------------------- + +#define montreds(d4,d3,d2,d1,d0, t2,t1,t0) \ +/* Let w = d0, the original word we use as offset; d0 gets recycled */ \ +/* First let [t2;t1] = 2^32 * w */ \ +/* then let [d0;t0] = (2^64 - 2^32 + 1) * w (overwrite old d0) */ \ + lsl t1, d0, #32 __LF \ + subs t0, d0, t1 __LF \ + lsr t2, d0, #32 __LF \ + sbc d0, d0, t2 __LF \ +/* Hence [d4;..;d1] := [d3;d2;d1;0] + (2^256 - 2^224 + 2^192 + 2^96) * w */ \ + adds d1, d1, t1 __LF \ + adcs d2, d2, t2 __LF \ + adcs d3, d3, t0 __LF \ + adc d4, d0, xzr + +// Input parameters + +#define z x0 +#define x x1 + +// Rotating registers for the intermediate windows (with repetitions) + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 + +// Other temporaries + +#define u x6 +#define v x7 +#define w x8 + +S2N_BN_SYMBOL(bignum_demont_p256): + +S2N_BN_SYMBOL(bignum_demont_p256_alt): + +// Set up an initial window with the input x and an extra leading zero + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Systematically scroll left doing 1-step reductions + + montreds(d0,d3,d2,d1,d0, u,v,w) + + montreds(d1,d0,d3,d2,d1, u,v,w) + + montreds(d2,d1,d0,d3,d2, u,v,w) + + montreds(d3,d2,d1,d0,d3, u,v,w) + +// Write back result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_double_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_double_p256.S new file mode 100644 index 00000000000..1d7e3460d08 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_double_p256.S @@ -0,0 +1,73 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
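Both bignum_deamont_p256 and bignum_demont_p256 are built from the montreds step described above, which relies on p_256 being congruent to -1 modulo 2^64: the word-level Montgomery multiplier is simply the low limb itself. A plain-C sketch of one step and of the four-step chain (assumed names, unsigned __int128 carries, functional reference only):

#include <stdint.h>

typedef unsigned __int128 u128;

static const uint64_t P256[4] = {    // p_256, least significant limb first
  0xffffffffffffffffULL, 0x00000000ffffffffULL,
  0x0000000000000000ULL, 0xffffffff00000001ULL
};

// One montreds-style step: z = (x + x[0] * p_256) / 2^64. Adding x[0] * p_256
// zeroes the bottom limb because p_256 == -1 (mod 2^64), and the quotient
// still fits in four limbs.
static void montreds_p256_ref(uint64_t z[4], const uint64_t x[4]) {
  uint64_t w = x[0];
  u128 c = ((u128)x[0] + (u128)w * P256[0]) >> 64;   // low limb becomes zero
  for (int i = 1; i < 4; i++) {
    u128 t = (u128)x[i] + (u128)w * P256[i] + c;
    z[i - 1] = (uint64_t)t;
    c = t >> 64;
  }
  z[3] = (uint64_t)c;
}

// Four chained steps give (x / 2^256) mod p_256 for x already below p_256,
// which is what bignum_demont_p256 computes; bignum_deamont_p256 adds the
// final compare-and-subtract so that any 4-limb input comes out fully reduced.
static void demont_p256_ref(uint64_t z[4], const uint64_t x[4]) {
  montreds_p256_ref(z, x);
  montreds_p256_ref(z, z);
  montreds_p256_ref(z, z);
  montreds_p256_ref(z, z);
}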
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Double modulo p_256, z := (2 * x) mod p_256, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_double_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_double_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_double_p256) + .text + .balign 4 + +#define z x0 +#define x x1 +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 +#define c x6 +#define n0 x7 +#define n1 x8 +#define n2 x9 +#define n3 x10 + + +S2N_BN_SYMBOL(bignum_double_p256): + +// Double the input number as 2 * x = c + [d3; d2; d1; d0] +// It's worth considering doing this with extr...63 instead + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + adds d0, d0, d0 + adcs d1, d1, d1 + adcs d2, d2, d2 + adcs d3, d3, d3 + adc c, xzr, xzr + +// Subtract p_256 to give 2 * x - p_256 = c + [n3; n2; n1; n0] + + subs n0, d0, #0xffffffffffffffff + mov n1, #0x00000000ffffffff + sbcs n1, d1, n1 + sbcs n2, d2, xzr + mov n3, #0xffffffff00000001 + sbcs n3, d3, n3 + sbcs c, c, xzr + +// Now CF is set (because of inversion) if 2 * x >= p_256, in which case the +// correct result is [n3; n2; n1; n0], otherwise [d3; d2; d1; d0] + + csel d0, d0, n0, cc + csel d1, d1, n1, cc + csel d2, d2, n2, cc + csel d3, d3, n3, cc + +// Store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_half_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_half_p256.S new file mode 100644 index 00000000000..e2612ac27cf --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_half_p256.S @@ -0,0 +1,71 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
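bignum_double_p256 above follows the same shape as bignum_add_p256 earlier in this change: widen the sum by one carry word, subtract p_256, and select the pre- or post-subtraction value based on the borrow. The C sketch below captures that shared pattern under an assumed name; it is a functional reference only and, unlike the csel-based assembly, makes no constant-time claim.

#include <stdint.h>

typedef unsigned __int128 u128;

static const uint64_t P256[4] = {    // p_256, least significant limb first
  0xffffffffffffffffULL, 0x00000000ffffffffULL,
  0x0000000000000000ULL, 0xffffffff00000001ULL
};

// z = (x + y) mod p_256, assuming x, y < p_256; doubling is the y == x case.
static void add_p256_ref(uint64_t z[4], const uint64_t x[4], const uint64_t y[4]) {
  uint64_t d[4], n[4], c = 0, b = 0;
  for (int i = 0; i < 4; i++) {                 // [c;d] = x + y
    u128 t = (u128)x[i] + y[i] + c;
    d[i] = (uint64_t)t;
    c = (uint64_t)(t >> 64);
  }
  for (int i = 0; i < 4; i++) {                 // n = d - p_256, tracking the borrow
    uint64_t bi = ((u128)d[i] < (u128)P256[i] + b);
    n[i] = d[i] - P256[i] - b;
    b = bi;
  }
  for (int i = 0; i < 4; i++)                   // keep the raw sum only if it was < p_256
    z[i] = (c < b) ? d[i] : n[i];
}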
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Halve modulo p_256, z := (x / 2) mod p_256, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_half_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_half_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_half_p256) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 +#define d4 x6 +#define m x7 +#define n x8 + +S2N_BN_SYMBOL(bignum_half_p256): + +// Load the 4 digits of x + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Get a bitmask corresponding to the lowest bit of the input + + and m, d0, #1 + neg m, m + +// Do a masked addition of p_256, catching carry in a 5th word + + adds d0, d0, m + and n, m, #0x00000000ffffffff + adcs d1, d1, n + adcs d2, d2, xzr + and n, m, #0xffffffff00000001 + adcs d3, d3, n + adc d4, xzr, xzr + +// Now shift that sum right one place + + extr d0, d1, d0, #1 + extr d1, d2, d1, #1 + extr d2, d3, d2, #1 + extr d3, d4, d3, #1 + +// Store back + + stp d0, d1, [z] + stp d2, d3, [z, #16] + +// Return + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_inv_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_inv_p256.S new file mode 100644 index 00000000000..489a9d5f6b2 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_inv_p256.S @@ -0,0 +1,1274 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Modular inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1 +// Input x[4]; output z[4] +// +// extern void bignum_inv_p256(uint64_t z[static 4],uint64_t x[static 4]); +// +// If the 4-digit input x is coprime to p_256, i.e. is not divisible +// by it, returns z < p_256 such that x * z == 1 (mod p_256). Note that +// x does not need to be reduced modulo p_256, but the output always is. +// If the input is divisible (i.e. is 0 or p_256), then there can be no +// modular inverse and z = 0 is returned. 
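The halving routine above never divides: when x is odd it adds p_256 (which is odd, so the five-limb sum becomes even) and then shifts the whole value right by one bit. A C sketch of that trick, with an assumed name and the p_256 limbs repeated for self-containment:

#include <stdint.h>

typedef unsigned __int128 u128;

static const uint64_t P256[4] = {    // p_256, least significant limb first
  0xffffffffffffffffULL, 0x00000000ffffffffULL,
  0x0000000000000000ULL, 0xffffffff00000001ULL
};

// z = (x / 2) mod p_256 for x < p_256.
static void half_p256_ref(uint64_t z[4], const uint64_t x[4]) {
  uint64_t m = (uint64_t)0 - (x[0] & 1);        // all-ones mask when x is odd
  uint64_t d[5];
  u128 c = 0;
  for (int i = 0; i < 4; i++) {                 // d = x + (p_256 masked by m)
    u128 t = (u128)x[i] + (P256[i] & m) + c;
    d[i] = (uint64_t)t;
    c = t >> 64;
  }
  d[4] = (uint64_t)c;
  for (int i = 0; i < 4; i++)                   // shift the 5-limb sum right one bit
    z[i] = (d[i] >> 1) | (d[i + 1] << 63);
}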
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_inv_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_inv_p256) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Used for the return pointer + +#define res x20 + +// Loop counter and d = 2 * delta value for divstep + +#define i x21 +#define d x22 + +// Registers used for matrix element magnitudes and signs + +#define m00 x10 +#define m01 x11 +#define m10 x12 +#define m11 x13 +#define s00 x14 +#define s01 x15 +#define s10 x16 +#define s11 x17 + +// Initial carries for combinations + +#define car0 x9 +#define car1 x19 + +// Input and output, plain registers treated according to pattern + +#define reg0 x0, #0 +#define reg1 x1, #0 +#define reg2 x2, #0 +#define reg3 x3, #0 +#define reg4 x4, #0 + +#define x x1, #0 +#define z x0, #0 + +// Pointer-offset pairs for temporaries on stack + +#define f sp, #0 +#define g sp, #(6*N) +#define u sp, #(12*N) +#define v sp, #(16*N) + +// Total size to reserve on the stack + +#define NSPACE #(20*N) + +// --------------------------------------------------------------------------- +// Core signed almost-Montgomery reduction macro. Takes input in +// [d4;d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to +// the existing [d4;d3;d2;d1], and re-using d0 as a temporary internally +// as well as t0, t1, t2. This is almost-Montgomery, i.e. the result fits +// in 4 digits but is not necessarily strictly reduced mod p_256. +// --------------------------------------------------------------------------- + +#define amontred(d4,d3,d2,d1,d0, t2,t1,t0) \ +/* We only know the input is -2^316 < x < 2^316. To do traditional */ \ +/* unsigned Montgomery reduction, start by adding 2^61 * p_256. */ \ + mov t0, #0xe000000000000000 __LF \ + adds d0, d0, t0 __LF \ + sbcs d1, d1, xzr __LF \ + mov t1, #0x000000001fffffff __LF \ + adcs d2, d2, t1 __LF \ + mov t2, #0x2000000000000000 __LF \ + adcs d3, d3, t2 __LF \ + mov t0, #0x1fffffffe0000000 __LF \ + adc d4, d4, t0 __LF \ +/* Let w = d0, the original word we use as offset; d0 gets recycled */ \ +/* First let [t2;t1] = 2^32 * w */ \ +/* then let [d0;t0] = (2^64 - 2^32 + 1) * w (overwrite old d0) */ \ + lsl t1, d0, #32 __LF \ + subs t0, d0, t1 __LF \ + lsr t2, d0, #32 __LF \ + sbc d0, d0, t2 __LF \ +/* Hence basic [d4;d3;d2;d1] += (2^256 - 2^224 + 2^192 + 2^96) * w */ \ + adds d1, d1, t1 __LF \ + adcs d2, d2, t2 __LF \ + adcs d3, d3, t0 __LF \ + adcs d4, d4, d0 __LF \ +/* Now capture top carry and subtract p_256 if set (almost-Montgomery) */ \ + mov t0, #0xffffffffffffffff __LF \ + mov t1, #0x00000000ffffffff __LF \ + mov t2, #0xffffffff00000001 __LF \ + csel t0, t0, xzr, cs __LF \ + csel t1, t1, xzr, cs __LF \ + csel t2, t2, xzr, cs __LF \ + subs d1, d1, t0 __LF \ + sbcs d2, d2, t1 __LF \ + sbcs d3, d3, xzr __LF \ + sbc d4, d4, t2 + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. 
+// But different in register usage and returning the final matrix in +// registers as follows +// +// [ m00 m01] +// [ m10 m11] + +#define divstep59() \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, 
x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x8, x4, #0x100, lsl #12 __LF \ + sbfx x8, x8, #21, #21 __LF \ + mov x11, #0x100000 __LF \ + add x11, x11, x11, lsl #21 __LF \ + add x9, x4, x11 __LF \ + asr x9, x9, #42 __LF \ + add x10, x5, #0x100, lsl #12 __LF \ + sbfx x10, x10, #21, #21 __LF \ + add x11, x5, x11 __LF \ + asr x11, x11, #42 __LF \ + mul x6, x8, x2 __LF \ + mul x7, x9, x3 __LF \ + mul x2, x10, x2 __LF \ + mul x3, x11, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr 
x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #21, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #42 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, 
x14, #21, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #42 __LF \ + mul x6, x12, x2 __LF \ + mul x7, x13, x3 __LF \ + mul x2, x14, x2 __LF \ + mul x3, x15, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + mul x2, x12, x8 __LF \ + mul x3, x12, x9 __LF \ + mul x6, x14, x8 __LF \ + mul x7, x14, x9 __LF \ + madd x8, x13, x10, x2 __LF \ + madd x9, x13, x11, x3 __LF \ + madd x16, x15, x10, x6 __LF \ + madd x17, x15, x11, x7 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, 
#0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #22, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #43 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, x14, #22, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #43 __LF \ + mneg x2, x12, x8 __LF \ + mneg x3, x12, x9 __LF \ + mneg x4, x14, x8 __LF \ + mneg x5, x14, x9 __LF \ + msub m00, x13, x16, x2 __LF \ + msub m01, x13, x17, x3 __LF \ + msub m10, x15, x16, x4 __LF \ + msub m11, x15, x17, x5 + +S2N_BN_SYMBOL(bignum_inv_p256): + +// Save registers and make room for temporaries + + stp x19, x20, [sp, -16]! + stp x21, x22, [sp, -16]! + stp x23, x24, [sp, -16]! + sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Copy the prime and input into the main f and g variables respectively. +// Make sure x is reduced so that g <= f as assumed in the bound proof. + + mov x10, #0xffffffffffffffff + mov x11, #0x00000000ffffffff + mov x13, #0xffffffff00000001 + stp x10, x11, [f] + stp xzr, x13, [f+2*N] + str xzr, [f+4*N] + + ldp x2, x3, [x1] + subs x10, x2, x10 + sbcs x11, x3, x11 + ldp x4, x5, [x1, #(2*N)] + sbcs x12, x4, xzr + sbcs x13, x5, x13 + + csel x2, x2, x10, cc + csel x3, x3, x11, cc + csel x4, x4, x12, cc + csel x5, x5, x13, cc + + stp x2, x3, [g] + stp x4, x5, [g+2*N] + str xzr, [g+4*N] + +// Also maintain reduced < 2^256 vector [u,v] such that +// [f,g] == x * 2^{5*i-50} * [u,v] (mod p_256) +// starting with [p_256,x] == x * 2^{5*0-50} * [0,2^50] (mod p_256) +// The weird-looking 5*i modifications come in because we are doing +// 64-bit word-sized Montgomery reductions at each stage, which is +// 5 bits more than the 59-bit requirement to keep things stable. + + stp xzr, xzr, [u] + stp xzr, xzr, [u+2*N] + + mov x10, #0x0004000000000000 + stp x10, xzr, [v] + stp xzr, xzr, [v+2*N] + +// Start of main loop. 
We jump into the middle so that the divstep +// portion is common to the special tenth iteration after a uniform +// first 9. + + mov i, #10 + mov d, #1 + b bignum_inv_p256_midloop + +bignum_inv_p256_loop: + +// Separate the matrix elements into sign-magnitude pairs + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in stable registers for the [u,v] part and do [f,g] first. + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + + and x0, m10, s10 + and x1, m11, s11 + add car1, x0, x1 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. +// +// Digit 0 of [f,g] + + ldr x7, [f] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [g] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x3, x3, x1 + +// Digit 1 of [f,g] + + ldr x7, [f+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [g+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [f] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [g] + +// Digit 2 of [f,g] + + ldr x7, [f+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [g+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [f+N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [g+N] + +// Digits 3 and 4 of [f,g] + + ldr x7, [f+3*N] + eor x1, x7, s00 + ldr x23, [f+4*N] + eor x3, x23, s00 + and x3, x3, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [g+3*N] + eor x1, x8, s01 + ldr x24, [g+4*N] + eor x0, x24, s01 + and x0, x0, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [f+2*N] + extr x5, x3, x5, #59 + str x5, [f+3*N] + asr x3, x3, #59 + str x3, [f+4*N] + + eor x1, x7, s10 + eor x5, x23, s10 + and x5, x5, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + eor x0, x24, s11 + and x0, x0, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [g+2*N] + extr x2, x5, x2, #59 + str x2, [g+3*N] + asr x5, x5, #59 + str x5, [g+4*N] + +// Now the computation of the updated u and v values and their +// Montgomery reductions. 
A very similar accumulation except that +// the top words of u and v are unsigned and we don't shift. +// +// Digit 0 of [u,v] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v] + adc x3, x3, x1 + +// Digit 1 of [u,v] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + str x3, [v+N] + adc x4, x4, x1 + +// Digit 2 of [u,v] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + str x4, [v+2*N] + adc x2, x2, x1 + +// Digits 3 and 4 of u (top is unsigned) + + ldr x7, [u+3*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + +// Montgomery reduction of u + + ldp x0, x1, [u] + ldr x6, [u+2*N] + amontred(x3,x5,x6,x1,x0, x10,x11,x14) + stp x1, x6, [u] + stp x5, x3, [u+16] + +// Digits 3 and 4 of v (top is unsigned) + + eor x1, x7, s10 + and x5, s10, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + and x0, s11, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + +// Montgomery reduction of v + + ldp x0, x1, [v] + ldr x3, [v+2*N] + amontred(x5,x2,x3,x1,x0, x10,x11,x14) + stp x1, x3, [v] + stp x2, x5, [v+16] + +bignum_inv_p256_midloop: + + mov x1, d + ldr x2, [f] + ldr x3, [g] + divstep59() + mov d, x1 + +// Next iteration + + subs i, i, #1 + bne bignum_inv_p256_loop + +// The 10th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + ldr x0, [f] + ldr x1, [g] + mul x0, x0, m00 + madd x1, x1, m01, x0 + asr x0, x1, #63 + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * [u,v] (mod p_256) +// we want to flip the sign of u according to that of f. 
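A small aside on the complement-instead-of-negation adjustment used throughout these accumulations: with s the sign mask of a matrix entry m (all ones when m is negative) and a = |m| its magnitude, m * x == a * (x ^ s) + (a & s) modulo 2^64, because -x == ~x + 1; the "(a & s)" corrections are what gets folded into the initial carries car0 and car1. A one-word C sketch of the identity (assumed name, avoiding any reliance on signed right shifts):

#include <stdint.h>

// Demonstrates m * x == |m| * (x ^ s) + (|m| & s) (mod 2^64),
// where s is all ones when m < 0 and zero otherwise.
static uint64_t signed_mul_via_complement(int64_t m, uint64_t x) {
  uint64_t s = (m < 0) ? ~(uint64_t)0 : 0;     // sign mask
  uint64_t a = ((uint64_t)m ^ s) - s;          // magnitude |m|
  return a * (x ^ s) + (a & s);                // equals (uint64_t)m * x
}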
+ + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + eor s00, s00, x0 + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + eor s01, s01, x0 + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + eor s10, s10, x0 + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + eor s11, s11, x0 + +// Adjust the initial value to allow for complement instead of negation + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + +// Digit 0 of [u] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + +// Digit 1 of [u] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + +// Digit 2 of [u] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + +// Digits 3 and 4 of u (top is unsigned) + + ldr x7, [u+3*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + +// Montgomery reduction of u. This needs to be strict not "almost" +// so it is followed by an optional subtraction of p_256 + + ldp x0, x1, [u] + ldr x2, [u+2*N] + amontred(x3,x5,x2,x1,x0, x10,x11,x14) + + mov x10, #0xffffffffffffffff + subs x10, x1, x10 + mov x11, #0x00000000ffffffff + sbcs x11, x2, x11 + mov x13, #0xffffffff00000001 + sbcs x12, x5, xzr + sbcs x13, x3, x13 + + csel x10, x1, x10, cc + csel x11, x2, x11, cc + csel x12, x5, x12, cc + csel x13, x3, x13, cc + +// Store it back to the final output + + stp x10, x11, [res] + stp x12, x13, [res, #16] + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_littleendian_4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_littleendian_4.S new file mode 100644 index 00000000000..84a4aec994d --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_littleendian_4.S @@ -0,0 +1,133 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert 4-digit (256-bit) bignum to/from little-endian form +// Input x[4]; output z[4] +// +// extern void bignum_littleendian_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// The same function is given two other prototypes whose names reflect the +// treatment of one or other argument as a byte array rather than word array: +// +// extern void bignum_fromlebytes_4 +// (uint64_t z[static 4], uint8_t x[static 32]); +// +// extern void bignum_tolebytes_4 +// (uint8_t z[static 32], uint64_t x[static 4]); +// +// The implementation works by loading in bytes and storing in words (i.e. 
+// stylistically it is "fromlebytes"); in the more common little-endian +// usage of ARM, this is just copying. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_littleendian_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_littleendian_4) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_fromlebytes_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_fromlebytes_4) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tolebytes_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tolebytes_4) + + .text + .balign 4 + +#define z x0 +#define x x1 + +#define d x2 +#define dshort w2 +#define a x3 + +S2N_BN_SYMBOL(bignum_littleendian_4): +S2N_BN_SYMBOL(bignum_fromlebytes_4): +S2N_BN_SYMBOL(bignum_tolebytes_4): + +// word 0 + + ldrb dshort, [x] + extr a, d, xzr, #8 + ldrb dshort, [x, #1] + extr a, d, a, #8 + ldrb dshort, [x, #2] + extr a, d, a, #8 + ldrb dshort, [x, #3] + extr a, d, a, #8 + ldrb dshort, [x, #4] + extr a, d, a, #8 + ldrb dshort, [x, #5] + extr a, d, a, #8 + ldrb dshort, [x, #6] + extr a, d, a, #8 + ldrb dshort, [x, #7] + extr a, d, a, #8 + str a, [z] + +// word 1 + + ldrb dshort, [x, #8] + extr a, d, xzr, #8 + ldrb dshort, [x, #9] + extr a, d, a, #8 + ldrb dshort, [x, #10] + extr a, d, a, #8 + ldrb dshort, [x, #11] + extr a, d, a, #8 + ldrb dshort, [x, #12] + extr a, d, a, #8 + ldrb dshort, [x, #13] + extr a, d, a, #8 + ldrb dshort, [x, #14] + extr a, d, a, #8 + ldrb dshort, [x, #15] + extr a, d, a, #8 + str a, [z, #8] + +// word 2 + + ldrb dshort, [x, #16] + extr a, d, xzr, #8 + ldrb dshort, [x, #17] + extr a, d, a, #8 + ldrb dshort, [x, #18] + extr a, d, a, #8 + ldrb dshort, [x, #19] + extr a, d, a, #8 + ldrb dshort, [x, #20] + extr a, d, a, #8 + ldrb dshort, [x, #21] + extr a, d, a, #8 + ldrb dshort, [x, #22] + extr a, d, a, #8 + ldrb dshort, [x, #23] + extr a, d, a, #8 + str a, [z, #16] + +// word 3 + + ldrb dshort, [x, #24] + extr a, d, xzr, #8 + ldrb dshort, [x, #25] + extr a, d, a, #8 + ldrb dshort, [x, #26] + extr a, d, a, #8 + ldrb dshort, [x, #27] + extr a, d, a, #8 + ldrb dshort, [x, #28] + extr a, d, a, #8 + ldrb dshort, [x, #29] + extr a, d, a, #8 + ldrb dshort, [x, #30] + extr a, d, a, #8 + ldrb dshort, [x, #31] + extr a, d, a, #8 + str a, [z, #24] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mod_n256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mod_n256.S new file mode 100644 index 00000000000..d99740d288f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mod_n256.S @@ -0,0 +1,175 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod n_256 +// Input x[k]; output z[4] +// +// extern void bignum_mod_n256 +// (uint64_t z[static 4], uint64_t k, uint64_t *x); +// +// Reduction is modulo the group order of the NIST curve P-256. 
+// +// Standard ARM ABI: X0 = z, X1 = k, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n256) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n256_alt) + .text + .balign 4 + +#define z x0 +#define k x1 +#define x x2 + +#define m0 x3 +#define m1 x4 +#define m2 x5 +#define m3 x6 + +#define t0 x7 +#define t1 x8 +#define t2 x9 +#define t3 x10 +#define t4 x11 + +#define n0 x12 +#define n1 x13 +#define n3 x14 + +// These two are aliased: we only load d when finished with q + +#define q x15 +#define d x15 + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(bignum_mod_n256): + +S2N_BN_SYMBOL(bignum_mod_n256_alt): + +// If the input is already <= 3 words long, go to a trivial "copy" path + + cmp k, #4 + bcc bignum_mod_n256_short + +// Otherwise load the top 4 digits (top-down) and reduce k by 4 + + sub k, k, #4 + lsl t0, k, #3 + add t0, t0, x + ldp m2, m3, [t0, #16] + ldp m0, m1, [t0] + +// Load the complicated three words of 2^256 - n_256 = [n3; 0; n1; n0] + + movbig( n0, #0x0c46, #0x353d, #0x039c, #0xdaaf) + movbig( n1, #0x4319, #0x0552, #0x58e8, #0x617b) + mov n3, #0x00000000ffffffff + +// Reduce the top 4 digits mod n_256 (a conditional subtraction of n_256) + + adds t0, m0, n0 + adcs t1, m1, n1 + adcs t2, m2, xzr + adcs t3, m3, n3 + csel m0, m0, t0, cc + csel m1, m1, t1, cc + csel m2, m2, t2, cc + csel m3, m3, t3, cc + +// Now do (k-4) iterations of 5->4 word modular reduction + + cbz k, bignum_mod_n256_writeback +bignum_mod_n256_loop: + +// Writing the input as z = 2^256 * m3 + 2^192 * m2 + t = 2^192 * h + t, our +// intended quotient approximation is MIN ((h + h>>32 + 1)>>64) (2^64 - 1). + + subs xzr, xzr, xzr // set carry flag for +1 + extr q, m3, m2, #32 + adcs xzr, m2, q + lsr q, m3, #32 + adcs q, m3, q + csetm t0, cs + orr q, q, t0 + +// [t4;t3;t2;t1;t0] = q * (2^256 - n_256) + + mul t0, n0, q + mul t1, n1, q + mul t3, n3, q + umulh t2, n0, q + adds t1, t1, t2 + umulh t2, n1, q + adc t2, t2, xzr // No carry: high of mul + {0,1} + umulh t4, n3, q + +// Compensate for 2^256 * q + + sub m3, m3, q + +// Decrement k and load the next digit (note that d aliases to q) + + sub k, k, #1 + ldr d, [x, k, lsl #3] + +// [t4;t3;t2;t1;t0] = [m3;m2;m1;m0;d] - q * n_256 + + adds t0, d, t0 + adcs t1, m0, t1 + adcs t2, m1, t2 + adcs t3, m2, t3 + adc t4, m3, t4 + +// Now our top word t4 is either zero or all 1s. 
Use it for a masked +// addition of n_256, which we can do by a *subtraction* of +// 2^256 - n_256 from our portion, re-using the constants + + and d, t4, n0 + subs m0, t0, d + and d, t4, n1 + sbcs m1, t1, d + sbcs m2, t2, xzr + and d, t4, n3 + sbc m3, t3, d + + cbnz k, bignum_mod_n256_loop + +// Finally write back [m3;m2;m1;m0] and return + +bignum_mod_n256_writeback: + stp m0, m1, [z] + stp m2, m3, [z, #16] + ret + +// Short case: just copy the input with zero-padding + +bignum_mod_n256_short: + mov m0, xzr + mov m1, xzr + mov m2, xzr + mov m3, xzr + + cbz k, bignum_mod_n256_writeback + ldr m0, [x] + subs k, k, #1 + beq bignum_mod_n256_writeback + ldr m1, [x, #8] + subs k, k, #1 + beq bignum_mod_n256_writeback + ldr m2, [x, #16] + b bignum_mod_n256_writeback + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mod_n256_4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mod_n256_4.S new file mode 100644 index 00000000000..4ea3c905347 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mod_n256_4.S @@ -0,0 +1,81 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod n_256 +// Input x[4]; output z[4] +// +// extern void bignum_mod_n256_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Reduction is modulo the group order of the NIST curve P-256. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n256_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n256_4) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define n0 x2 +#define n1 x3 +#define n2 x4 +#define n3 x5 + +#define d0 x6 +#define d1 x7 +#define d2 x8 +#define d3 x9 + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(bignum_mod_n256_4): + +// Load the complicated three words of n_256, the other being all 1s + + movbig( n0, #0xf3b9, #0xcac2, #0xfc63, #0x2551) + movbig( n1, #0xbce6, #0xfaad, #0xa717, #0x9e84) + mov n3, #0xffffffff00000000 + +// Load the input number + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Do the subtraction. Since word 2 of n_256 is all 1s, that can be +// done by adding zero with carry, thanks to the inverted carry. + + subs n0, d0, n0 + sbcs n1, d1, n1 + adcs n2, d2, xzr + sbcs n3, d3, n3 + +// Now if the carry is *clear* (inversion at work) the subtraction carried +// and hence we should have done nothing, so we reset each n_i = d_i + + csel n0, d0, n0, cc + csel n1, d1, n1, cc + csel n2, d2, n2, cc + csel n3, d3, n3, cc + +// Store the end result + + stp n0, n1, [z] + stp n2, n3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mod_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mod_p256.S new file mode 100644 index 00000000000..b53ee5b7791 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mod_p256.S @@ -0,0 +1,162 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo field characteristic, z := x mod p_256 +// Input x[k]; output z[4] +// +// extern void bignum_mod_p256 +// (uint64_t z[static 4], uint64_t k, uint64_t *x); +// +// Standard ARM ABI: X0 = z, X1 = k, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_p256) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_p256_alt) + .text + .balign 4 + +#define z x0 +#define k x1 +#define x x2 + +#define m0 x3 +#define m1 x4 +#define m2 x5 +#define m3 x6 + +#define t0 x7 +#define t1 x8 +#define t2 x9 +#define t3 x10 +#define t4 x11 + +#define n1 x12 +#define n3 x13 + +#define q x14 + + +S2N_BN_SYMBOL(bignum_mod_p256): + +S2N_BN_SYMBOL(bignum_mod_p256_alt): + +// If the input is already <= 3 words long, go to a trivial "copy" path + + cmp k, #4 + bcc bignum_mod_p256_short + +// Otherwise load the top 4 digits (top-down) and reduce k by 4 + + sub k, k, #4 + lsl t0, k, #3 + add t0, t0, x + ldp m2, m3, [t0, #16] + ldp m0, m1, [t0] + +// Load the complicated words of p_256 = [n3;0;n1;-1] + + mov n1, #0x00000000ffffffff + mov n3, #0xffffffff00000001 + +// Reduce the top 4 digits mod p_256 (a conditional subtraction of p_256) + + subs t0, m0, #-1 + sbcs t1, m1, n1 + sbcs t2, m2, xzr + sbcs t3, m3, n3 + + csel m0, m0, t0, cc + csel m1, m1, t1, cc + csel m2, m2, t2, cc + csel m3, m3, t3, cc + +// Now do (k-4) iterations of 5->4 word modular reduction + + cbz k, bignum_mod_p256_writeback +bignum_mod_p256_loop: + +// Decrement k and load the next digit as t0. We then want to reduce +// [m3;m2;m1;m0;t0] |-> [m3;m2;m1;m0]; the shuffling downwards is absorbed +// into the various ALU operations + + sub k, k, #1 + ldr t0, [x, k, lsl #3] + +// Writing the input as z = 2^256 * h + 2^192 * l + t = 2^192 * hl + t, our +// intended quotient approximation is MIN ((hl + hl>>32 + 1)>>64) (2^64 - 1). 
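+//
+// (Purely illustrative restatement in Python-style arithmetic, where hl is
+// the 128-bit value [m3;m2]:
+//
+//     q = min((hl + (hl >> 32) + 1) >> 64, 2**64 - 1)
+//
+// which is what the saturating sequence below computes.)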
+ + subs xzr, xzr, xzr // set carry flag for +1 + extr q, m3, m2, #32 + adcs xzr, m2, q + lsr q, m3, #32 + adcs q, m3, q + csetm t1, cs + orr q, q, t1 + +// First do [t2;t1] = 2^32 * q, which we use twice + + lsl t1, q, #32 + lsr t2, q, #32 + +// Add 2^224 * q to sum + + adds t3, m2, t1 + adc t4, m3, t2 + +// Accumulate [t2;t1;m3] = (2^96 - 1) * q, using m3 briefly as a temporary + + subs m3, xzr, q + sbcs t1, t1, xzr + sbc t2, t2, xzr + +// Subtract (2^256 + 2^192 + 2^96 - 1) * q + + subs t0, t0, m3 + sbcs t1, m0, t1 + sbcs t2, m1, t2 + sbcs t3, t3, q + sbc t4, t4, q + +// Use top word as mask to correct + + adds m0, t0, t4 + and t0, n1, t4 + adcs m1, t1, t0 + adcs m2, t2, xzr + and t0, n3, t4 + adc m3, t3, t0 + + cbnz k, bignum_mod_p256_loop + +// Finally write back [m3;m2;m1;m0] and return + +bignum_mod_p256_writeback: + stp m0, m1, [z] + stp m2, m3, [z, #16] + ret + +// Short case: just copy the input with zero-padding + +bignum_mod_p256_short: + mov m0, xzr + mov m1, xzr + mov m2, xzr + mov m3, xzr + + cbz k, bignum_mod_p256_writeback + ldr m0, [x] + subs k, k, #1 + beq bignum_mod_p256_writeback + ldr m1, [x, #8] + subs k, k, #1 + beq bignum_mod_p256_writeback + ldr m2, [x, #16] + b bignum_mod_p256_writeback + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mod_p256_4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mod_p256_4.S new file mode 100644 index 00000000000..0a910eccfe5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mod_p256_4.S @@ -0,0 +1,71 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo field characteristic, z := x mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_mod_p256_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_p256_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_p256_4) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define n0 x2 +#define n1 x3 +#define n2 x4 +#define n3 x5 + +#define d0 x6 +#define d1 x7 +#define d2 x8 +#define d3 x9 + + +S2N_BN_SYMBOL(bignum_mod_p256_4): + +// Load the three nonzero words of p_256 = [n3;0;n2;n1] + + mov n0, #0xffffffffffffffff + mov n1, #0x00000000ffffffff + mov n3, #0xffffffff00000001 + +// Load the input number + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Do the subtraction. 
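+// (Illustrative sketch, not part of the code: since x < 2^256 < 2*p_256, a
+// single conditional subtraction is a full reduction. In Python terms:
+//
+//     p_256 = 2**256 - 2**224 + 2**192 + 2**96 - 1
+//     z = x - p_256 if x >= p_256 else x
+//
+// The subs/sbcs chain below computes x - p_256 and the csel instructions
+// restore x if the subtraction borrowed.)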
+ + subs n0, d0, n0 + sbcs n1, d1, n1 + sbcs n2, d2, xzr + sbcs n3, d3, n3 + +// Now if the carry is *clear* (inversion at work) the subtraction carried +// and hence we should have done nothing, so we reset each n_i = d_i + + csel n0, d0, n0, cc + csel n1, d1, n1, cc + csel n2, d2, n2, cc + csel n3, d3, n3, cc + +// Store the end result + + stp n0, n1, [z] + stp n2, n3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montinv_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montinv_p256.S new file mode 100644 index 00000000000..46f71514aa4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montinv_p256.S @@ -0,0 +1,1303 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1 +// Input x[4]; output z[4] +// +// extern void bignum_montinv_p256(uint64_t z[static 4],uint64_t x[static 4]); +// +// If the 4-digit input x is coprime to p_256, i.e. is not divisible +// by it, returns z < p_256 such that x * z == 2^512 (mod p_256). This +// is effectively "Montgomery inverse" because if we consider x and z as +// Montgomery forms of X and Z, i.e. x == 2^256 * X and z == 2^256 * Z +// (both mod p_256) then X * Z == 1 (mod p_256). That is, this function +// gives the analog of the modular inverse bignum_inv_p256 but with both +// input and output in the Montgomery domain. Note that x does not need +// to be reduced modulo p_256, but the output always is. If the input +// is divisible (i.e. is 0 or p_256), then there can be no solution to +// the congruence x * z == 2^512 (mod p_256), and z = 0 is returned. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montinv_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montinv_p256) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Used for the return pointer + +#define res x20 + +// Loop counter and d = 2 * delta value for divstep + +#define i x21 +#define d x22 + +// Registers used for matrix element magnitudes and signs + +#define m00 x10 +#define m01 x11 +#define m10 x12 +#define m11 x13 +#define s00 x14 +#define s01 x15 +#define s10 x16 +#define s11 x17 + +// Initial carries for combinations + +#define car0 x9 +#define car1 x19 + +// Input and output, plain registers treated according to pattern + +#define reg0 x0, #0 +#define reg1 x1, #0 +#define reg2 x2, #0 +#define reg3 x3, #0 +#define reg4 x4, #0 + +#define x x1, #0 +#define z x0, #0 + +// Pointer-offset pairs for temporaries on stack + +#define f sp, #0 +#define g sp, #(6*N) +#define u sp, #(12*N) +#define v sp, #(16*N) + +// Total size to reserve on the stack + +#define NSPACE #(20*N) + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +// --------------------------------------------------------------------------- +// Core signed almost-Montgomery reduction macro. 
Takes input in +// [d4;d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to +// the existing [d4;d3;d2;d1], and re-using d0 as a temporary internally +// as well as t0, t1, t2. This is almost-Montgomery, i.e. the result fits +// in 4 digits but is not necessarily strictly reduced mod p_256. +// --------------------------------------------------------------------------- + +#define amontred(d4,d3,d2,d1,d0, t2,t1,t0) \ +/* We only know the input is -2^316 < x < 2^316. To do traditional */ \ +/* unsigned Montgomery reduction, start by adding 2^61 * p_256. */ \ + mov t0, #0xe000000000000000 __LF \ + adds d0, d0, t0 __LF \ + sbcs d1, d1, xzr __LF \ + mov t1, #0x000000001fffffff __LF \ + adcs d2, d2, t1 __LF \ + mov t2, #0x2000000000000000 __LF \ + adcs d3, d3, t2 __LF \ + mov t0, #0x1fffffffe0000000 __LF \ + adc d4, d4, t0 __LF \ +/* Let w = d0, the original word we use as offset; d0 gets recycled */ \ +/* First let [t2;t1] = 2^32 * w */ \ +/* then let [d0;t0] = (2^64 - 2^32 + 1) * w (overwrite old d0) */ \ + lsl t1, d0, #32 __LF \ + subs t0, d0, t1 __LF \ + lsr t2, d0, #32 __LF \ + sbc d0, d0, t2 __LF \ +/* Hence basic [d4;d3;d2;d1] += (2^256 - 2^224 + 2^192 + 2^96) * w */ \ + adds d1, d1, t1 __LF \ + adcs d2, d2, t2 __LF \ + adcs d3, d3, t0 __LF \ + adcs d4, d4, d0 __LF \ +/* Now capture top carry and subtract p_256 if set (almost-Montgomery) */ \ + mov t0, #0xffffffffffffffff __LF \ + mov t1, #0x00000000ffffffff __LF \ + mov t2, #0xffffffff00000001 __LF \ + csel t0, t0, xzr, cs __LF \ + csel t1, t1, xzr, cs __LF \ + csel t2, t2, xzr, cs __LF \ + subs d1, d1, t0 __LF \ + sbcs d2, d2, t1 __LF \ + sbcs d3, d3, xzr __LF \ + sbc d4, d4, t2 + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. +// But different in register usage and returning the final matrix in +// registers as follows +// +// [ m00 m01] +// [ m10 m11] + +#define divstep59() \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, 
x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x8, x4, #0x100, lsl #12 __LF \ + sbfx x8, x8, #21, #21 __LF \ + mov x11, #0x100000 __LF \ + add x11, x11, x11, lsl #21 __LF \ + add x9, x4, x11 __LF \ + asr x9, x9, #42 __LF \ + add x10, x5, #0x100, lsl #12 __LF \ + sbfx x10, x10, #21, #21 __LF \ + add x11, x5, x11 __LF \ + asr x11, x11, #42 __LF \ + mul x6, x8, x2 __LF \ + 
mul x7, x9, x3 __LF \ + mul x2, x10, x2 __LF \ + mul x3, x11, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF 
\ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #21, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #42 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, x14, #21, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #42 __LF \ + mul x6, x12, x2 __LF \ + mul x7, x13, x3 __LF \ + mul x2, x14, x2 __LF \ + mul x3, x15, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst 
x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + mul x2, x12, x8 __LF \ + mul x3, x12, x9 __LF \ + mul x6, x14, x8 __LF \ + mul x7, x14, x9 __LF \ + madd x8, x13, x10, x2 __LF \ + madd x9, x13, x11, x3 __LF \ + madd x16, x15, x10, x6 __LF \ + madd x17, x15, x11, x7 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #22, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #43 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + 
sbfx x14, x14, #22, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #43 __LF \ + mneg x2, x12, x8 __LF \ + mneg x3, x12, x9 __LF \ + mneg x4, x14, x8 __LF \ + mneg x5, x14, x9 __LF \ + msub m00, x13, x16, x2 __LF \ + msub m01, x13, x17, x3 __LF \ + msub m10, x15, x16, x4 __LF \ + msub m11, x15, x17, x5 + +S2N_BN_SYMBOL(bignum_montinv_p256): + +// Save registers and make room for temporaries + + stp x19, x20, [sp, -16]! + stp x21, x22, [sp, -16]! + stp x23, x24, [sp, -16]! + sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Copy the prime and input into the main f and g variables respectively. +// Make sure x is reduced so that g <= f as assumed in the bound proof. + + mov x10, #0xffffffffffffffff + mov x11, #0x00000000ffffffff + mov x13, #0xffffffff00000001 + stp x10, x11, [f] + stp xzr, x13, [f+2*N] + str xzr, [f+4*N] + + ldp x2, x3, [x1] + subs x10, x2, x10 + sbcs x11, x3, x11 + ldp x4, x5, [x1, #(2*N)] + sbcs x12, x4, xzr + sbcs x13, x5, x13 + + csel x2, x2, x10, cc + csel x3, x3, x11, cc + csel x4, x4, x12, cc + csel x5, x5, x13, cc + + stp x2, x3, [g] + stp x4, x5, [g+2*N] + str xzr, [g+4*N] + +// Also maintain reduced < 2^256 vector [u,v] such that +// [f,g] == x * 2^{5*i-562} * [u,v] (mod p_256) +// starting with [p_256,x] == x * 2^{5*0-562} * [0,2^562] (mod p_256) +// The weird-looking 5*i modifications come in because we are doing +// 64-bit word-sized Montgomery reductions at each stage, which is +// 5 bits more than the 59-bit requirement to keep things stable. +// After the 10th and last iteration and sign adjustment, when +// f == 1 for in-scope cases, we have x * 2^{50-562} * u == 1, i.e. +// x * u == 2^512 as required. + + stp xzr, xzr, [u] + stp xzr, xzr, [u+2*N] + +// The starting constant 2^562 mod p_256 is +// 0x000bffffffebffff:fffbffffffefffff:ffe8000000000000:000c000000140000 +// where colons separate 64-bit subwords, least significant at the right. +// Only word number 1, value 0xffe8000000000000, is a single ARM move. + + mov x10, #0x0000000000140000 + orr x10, x10, #0x000c000000000000 + + mov x11, #0xffe8000000000000 + + movbig(x13, #0x000b, #0xffff, #0xffef, #0xffff) + orr x12, x13, #0xfff0000000000000 + and x13, x13, #0xfffffffffffbffff + + stp x10, x11, [v] + stp x12, x13, [v+2*N] + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special tenth iteration after a uniform +// first 9. + + mov i, #10 + mov d, #1 + b bignum_montinv_p256_midloop + +bignum_montinv_p256_loop: + +// Separate the matrix elements into sign-magnitude pairs + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in stable registers for the [u,v] part and do [f,g] first. + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + + and x0, m10, s10 + and x1, m11, s11 + add car1, x0, x1 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. 
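+//
+// (Illustrative sketch, assuming Python-style big integers: with the signed
+// matrix entries returned by divstep59, each iteration below effectively
+// performs
+//
+//     f, g = (m00*f + m01*g) >> 59, (m10*f + m11*g) >> 59
+//
+// using arithmetic shifts; the digit-by-digit code is this accumulation with
+// negation expressed as XOR-with-sign-mask plus the correction added above.)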
+// +// Digit 0 of [f,g] + + ldr x7, [f] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [g] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x3, x3, x1 + +// Digit 1 of [f,g] + + ldr x7, [f+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [g+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [f] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [g] + +// Digit 2 of [f,g] + + ldr x7, [f+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [g+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [f+N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [g+N] + +// Digits 3 and 4 of [f,g] + + ldr x7, [f+3*N] + eor x1, x7, s00 + ldr x23, [f+4*N] + eor x3, x23, s00 + and x3, x3, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [g+3*N] + eor x1, x8, s01 + ldr x24, [g+4*N] + eor x0, x24, s01 + and x0, x0, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [f+2*N] + extr x5, x3, x5, #59 + str x5, [f+3*N] + asr x3, x3, #59 + str x3, [f+4*N] + + eor x1, x7, s10 + eor x5, x23, s10 + and x5, x5, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + eor x0, x24, s11 + and x0, x0, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [g+2*N] + extr x2, x5, x2, #59 + str x2, [g+3*N] + asr x5, x5, #59 + str x5, [g+4*N] + +// Now the computation of the updated u and v values and their +// Montgomery reductions. A very similar accumulation except that +// the top words of u and v are unsigned and we don't shift. 
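+//
+// (Illustrative sketch of the word-level Montgomery step used here, assuming
+// Python-style big integers: since p_256 = 2**256 - 2**224 + 2**192 + 2**96 - 1
+// is congruent to -1 mod 2**64, the Montgomery quotient is just the low word,
+// so one step is
+//
+//     w = t & (2**64 - 1)
+//     t = (t + w * p_256) >> 64        # congruent to t * 2^-64 (mod p_256)
+//
+// which is what amontred implements, up to its "almost" (not fully reduced)
+// output range and the initial offset that makes the signed input positive.)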
+// +// Digit 0 of [u,v] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v] + adc x3, x3, x1 + +// Digit 1 of [u,v] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + str x3, [v+N] + adc x4, x4, x1 + +// Digit 2 of [u,v] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + str x4, [v+2*N] + adc x2, x2, x1 + +// Digits 3 and 4 of u (top is unsigned) + + ldr x7, [u+3*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + +// Montgomery reduction of u + + ldp x0, x1, [u] + ldr x6, [u+2*N] + amontred(x3,x5,x6,x1,x0, x10,x11,x14) + stp x1, x6, [u] + stp x5, x3, [u+16] + +// Digits 3 and 4 of v (top is unsigned) + + eor x1, x7, s10 + and x5, s10, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + and x0, s11, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + +// Montgomery reduction of v + + ldp x0, x1, [v] + ldr x3, [v+2*N] + amontred(x5,x2,x3,x1,x0, x10,x11,x14) + stp x1, x3, [v] + stp x2, x5, [v+16] + +bignum_montinv_p256_midloop: + + mov x1, d + ldr x2, [f] + ldr x3, [g] + divstep59() + mov d, x1 + +// Next iteration + + subs i, i, #1 + bne bignum_montinv_p256_loop + +// The 10th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + ldr x0, [f] + ldr x1, [g] + mul x0, x0, m00 + madd x1, x1, m01, x0 + asr x0, x1, #63 + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * 2^{-512} [u,v] (mod p_256) +// we want to flip the sign of u according to that of f. 
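+//
+// (Illustrative check of the overall contract, in Python terms: on return,
+// for any input x not divisible by p_256, the stored z should satisfy
+//
+//     (x * z) % p_256 == pow(2, 512, p_256)
+//
+// matching the "Montgomery inverse" specification in the header comment.)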
+ + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + eor s00, s00, x0 + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + eor s01, s01, x0 + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + eor s10, s10, x0 + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + eor s11, s11, x0 + +// Adjust the initial value to allow for complement instead of negation + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + +// Digit 0 of [u] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + +// Digit 1 of [u] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + +// Digit 2 of [u] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + +// Digits 3 and 4 of u (top is unsigned) + + ldr x7, [u+3*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + +// Montgomery reduction of u. This needs to be strict not "almost" +// so it is followed by an optional subtraction of p_256 + + ldp x0, x1, [u] + ldr x2, [u+2*N] + amontred(x3,x5,x2,x1,x0, x10,x11,x14) + + mov x10, #0xffffffffffffffff + subs x10, x1, x10 + mov x11, #0x00000000ffffffff + sbcs x11, x2, x11 + mov x13, #0xffffffff00000001 + sbcs x12, x5, xzr + sbcs x13, x3, x13 + + csel x10, x1, x10, cc + csel x11, x2, x11, cc + csel x12, x5, x12, cc + csel x13, x3, x13, cc + +// Store it back to the final output + + stp x10, x11, [res] + stp x12, x13, [res, #16] + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montmul_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montmul_p256.S new file mode 100644 index 00000000000..d02aa3c6b62 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montmul_p256.S @@ -0,0 +1,462 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^256) mod p_256 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_montmul_p256 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Does z := (2^{-256} * x * y) mod p_256, assuming that the inputs x and y +// satisfy x * y <= 2^256 * p_256 (in particular this is true if we are in +// the "usual" case x < p_256 and y < p_256). +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- + +// bignum_montmul_p256 is functionally equivalent to +// unopt/bignum_montmul_p256_base. +// It is written in a way that +// 1. 
A subset of scalar multiplications in bignum_montmul_p256_base are carefully +// chosen and vectorized +// 2. The vectorized assembly is rescheduled using the SLOTHY superoptimizer. +// https://github.com/slothy-optimizer/slothy +// +// The output program of step 1. is as follows: +// +// ldp x7, x13, [x1] +// ldr q16, [x1] +// ldp x9, x15, [x1, #16] +// ldp x14, x4, [x2] +// ldr q19, [x2] +// ldp x12, x16, [x2, #16] +// ldr q29, [x1, #16] +// ldr q30, [x2, #16] +// uzp1 v17.4S, v19.4S, v16.4S +// rev64 v18.4S, v19.4S +// uzp1 v28.4S, v16.4S, v16.4S +// mul v24.4S, v18.4S, v16.4S +// uaddlp v18.2D, v24.4S +// shl v16.2D, v18.2D, #32 +// umlal v16.2D, v28.2S, v17.2S +// mov x2, v16.d[0] +// mov x1, v16.d[1] +// umulh x5, x7, x14 +// adds x17, x2, x1 +// umulh x3, x13, x4 +// adcs x8, x5, x3 +// adcs x10, x3, xzr +// adds x5, x5, x17 +// adcs x1, x1, x8 +// adcs x8, x10, xzr +// subs x17, x7, x13 +// cneg x3, x17, cc +// csetm x11, cc +// subs x10, x4, x14 +// cneg x6, x10, cc +// mul x17, x3, x6 +// umulh x6, x3, x6 +// cinv x11, x11, cc +// eor x17, x17, x11 +// eor x3, x6, x11 +// cmn x11, #0x1 +// adcs x5, x5, x17 +// adcs x10, x1, x3 +// adc x1, x8, x11 +// lsl x3, x2, #32 +// subs x17, x2, x3 +// lsr x11, x2, #32 +// sbc x8, x2, x11 +// adds x2, x5, x3 +// adcs x6, x10, x11 +// adcs x3, x1, x17 +// adc x10, x8, xzr +// lsl x5, x2, #32 +// subs x17, x2, x5 +// lsr x11, x2, #32 +// sbc x8, x2, x11 +// adds x2, x6, x5 +// adcs x6, x3, x11 +// adcs x1, x10, x17 +// adc x17, x8, xzr +// stp x2, x6, [x0] // @slothy:writes=buffer0 +// stp x1, x17, [x0, #16] // @slothy:writes=buffer16 +// movi v28.2D, #0x00000000ffffffff +// uzp2 v22.4S, v30.4S, v30.4S +// xtn v4.2S, v29.2D +// xtn v27.2S, v30.2D +// rev64 v23.4S, v30.4S +// umull v17.2D, v4.2S, v27.2S +// umull v7.2D, v4.2S, v22.2S +// uzp2 v16.4S, v29.4S, v29.4S +// mul v29.4S, v23.4S, v29.4S +// usra v7.2D, v17.2D, #32 +// umull v30.2D, v16.2S, v22.2S +// uaddlp v20.2D, v29.4S +// and v18.16B, v7.16B, v28.16B +// umlal v18.2D, v16.2S, v27.2S +// shl v16.2D, v20.2D, #32 +// usra v30.2D, v7.2D, #32 +// umlal v16.2D, v4.2S, v27.2S +// usra v30.2D, v18.2D, #32 +// mov x11, v16.d[0] +// mov x5, v16.d[1] +// mov x2, v30.d[0] +// adds x3, x11, x5 +// mov x17, v30.d[1] +// adcs x8, x2, x17 +// adcs x1, x17, xzr +// adds x17, x2, x3 +// adcs x8, x5, x8 +// adcs x1, x1, xzr +// subs x2, x9, x15 +// cneg x6, x2, cc +// csetm x3, cc +// subs x2, x16, x12 +// cneg x5, x2, cc +// mul x10, x6, x5 +// umulh x5, x6, x5 +// cinv x3, x3, cc +// eor x10, x10, x3 +// eor x6, x5, x3 +// cmn x3, #0x1 +// adcs x2, x17, x10 +// adcs x6, x8, x6 +// adc x5, x1, x3 +// subs x7, x9, x7 +// sbcs x3, x15, x13 +// ngc x17, xzr +// cmn x17, #0x1 +// eor x8, x7, x17 +// adcs x13, x8, xzr +// eor x15, x3, x17 +// adcs x1, x15, xzr +// subs x9, x14, x12 +// sbcs x14, x4, x16 +// ngc x3, xzr +// cmn x3, #0x1 +// eor x12, x9, x3 +// adcs x7, x12, xzr +// eor x12, x14, x3 +// adcs x12, x12, xzr +// eor x10, x17, x3 +// ldp x4, x15, [x0] // @slothy:reads=buffer0 +// adds x17, x11, x4 +// adcs x16, x2, x15 +// ldp x3, x15, [x0, #16] // @slothy:reads=buffer16 +// adcs x11, x6, x3 +// adcs x9, x5, x15 +// adc x14, xzr, xzr +// mul x6, x13, x7 +// mul x8, x1, x12 +// umulh x5, x13, x7 +// adds x3, x6, x8 +// umulh x2, x1, x12 +// adcs x4, x5, x2 +// adcs x15, x2, xzr +// adds x3, x5, x3 +// adcs x4, x8, x4 +// adcs x15, x15, xzr +// subs x1, x13, x1 +// cneg x8, x1, cc +// csetm x5, cc +// subs x1, x12, x7 +// cneg x2, x1, cc +// mul x7, x8, x2 +// umulh x2, x8, x2 +// cinv x13, x5, 
cc +// eor x7, x7, x13 +// eor x2, x2, x13 +// cmn x13, #0x1 +// adcs x3, x3, x7 +// adcs x4, x4, x2 +// adc x5, x15, x13 +// cmn x10, #0x1 +// eor x8, x6, x10 +// adcs x15, x8, x17 +// eor x2, x3, x10 +// adcs x2, x2, x16 +// eor x6, x4, x10 +// adcs x3, x6, x11 +// eor x7, x5, x10 +// adcs x1, x7, x9 +// adcs x13, x14, x10 +// adcs x12, x10, xzr +// adc x10, x10, xzr +// adds x5, x3, x17 +// adcs x8, x1, x16 +// adcs x13, x13, x11 +// adcs x6, x12, x9 +// adc x4, x10, x14 +// lsl x9, x15, #32 +// subs x7, x15, x9 +// lsr x1, x15, #32 +// sbc x14, x15, x1 +// adds x10, x2, x9 +// adcs x15, x5, x1 +// adcs x5, x8, x7 +// adc x7, x14, xzr +// lsl x12, x10, #32 +// subs x17, x10, x12 +// lsr x9, x10, #32 +// sbc x3, x10, x9 +// adds x12, x15, x12 +// adcs x5, x5, x9 +// adcs x14, x7, x17 +// adc x2, x3, xzr +// adds x14, x13, x14 +// adcs x6, x6, x2 +// adc x17, x4, xzr +// add x7, x17, #0x1 +// lsl x16, x7, #32 +// adds x3, x6, x16 +// adc x1, x17, xzr +// neg x15, x7 +// sub x13, x16, #0x1 +// subs x9, x12, x15 +// sbcs x8, x5, x13 +// sbcs x15, x14, xzr +// sbcs x3, x3, x7 +// sbcs x7, x1, x7 +// adds x4, x9, x7 +// mov x6, #0xffffffff +// and x17, x6, x7 +// adcs x8, x8, x17 +// adcs x5, x15, xzr +// mov x10, #0xffffffff00000001 +// and x1, x10, x7 +// adc x12, x3, x1 +// stp x4, x8, [x0] // @slothy:writes=buffer0 +// stp x5, x12, [x0, #16] // @slothy:writes=buffer16 +// ret +// +// The bash script used for step 2 is as follows: +// +// # Store the assembly instructions except the last 'ret' as, say, 'input.S' +// export OUTPUTS="[hint_buffer0,hint_buffer16]" +// export RESERVED_REGS="[x18,x19,x20,x21,x22,x23,x24,x25,x26,x27,x28,x29,x30,sp,q8,q9,q10,q11,q12,q13,q14,q15,v8,v9,v10,v11,v12,v13,v14,v15]" +// /tools/external/slothy.sh input.S my_out_dir +// # my_out_dir/3.opt.s is the optimized assembly. Its output may differ +// # from this file since the sequence is non-deterministically chosen. +// # Please add 'ret' at the end of the output assembly. 
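+//
+// As a purely illustrative reference model (not part of the build), the
+// contract stated above can be expressed with Python (3.8+) big integers as:
+//
+//   p_256 = 2**256 - 2**224 + 2**192 + 2**96 - 1
+//   def montmul_p256_ref(x, y):
+//       assert x * y <= 2**256 * p_256
+//       return (x * y * pow(2**256, -1, p_256)) % p_256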
+ + +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p256) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_montmul_p256): + + ldr q20, [x2] + ldp x7, x17, [x1] + ldr q0, [x1] + ldp x6, x10, [x2] + ldp x11, x15, [x1, #16] + rev64 v16.4S, v20.4S + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4S, v16.4S, v0.4S + umulh x12, x17, x10 + uzp1 v28.4S, v20.4S, v0.4S + subs x14, x11, x7 + ldr q20, [x2, #16] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2D, v16.4S + umulh x4, x7, x6 + uzp1 v21.4S, v0.4S, v0.4S + cneg x11, x8, cc + shl v17.2D, v27.2D, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2D, v21.2S, v28.2S + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [x1, #16] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2S, v20.2D + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4S, v20.4S, v20.4S + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2S, v28.2D + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x2, #16] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x9, x3, x13 + adcs x3, x8, x7 + umulh x8, x14, x11 + umull v21.2D, v0.2S, v1.2S + adcs x12, x10, x12 + umull v3.2D, v0.2S, v16.2S + adc x15, x15, xzr + rev64 v24.4S, v20.4S + stp x12, x15, [x0, #16] + movi v2.2D, #0x00000000ffffffff + mul x10, x14, x11 + mul v4.4S, v24.4S, v28.4S + subs x13, x14, x5 + uzp2 v19.4S, v28.4S, v28.4S + csetm x15, cc + usra v3.2D, v21.2D, #32 + mul x7, x5, x1 + umull v21.2D, v19.2S, v16.2S + cneg x13, x13, cc + uaddlp v5.2D, v4.4S + subs x11, x1, x11 + and v16.16B, v3.16B, v2.16B + umulh x5, x5, x1 + shl v24.2D, v5.2D, #32 + cneg x11, x11, cc + umlal v16.2D, v19.2S, v1.2S + cinv x12, x15, cc + umlal v24.2D, v0.2S, v1.2S + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + stp x9, x3, [x0] + usra v21.2D, v3.2D, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2D, v16.2D, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + ldp x15, x8, [x0] + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + ldp x9, x13, [x0, #16] + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x15 + adcs x15, x16, x8 + eor x5, x17, x4 + adcs x9, x1, x9 + eor x1, x10, x5 + adcs x16, x2, x13 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, 
x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [x0] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [x0, #16] + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montmul_p256_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montmul_p256_alt.S new file mode 100644 index 00000000000..98a396eec99 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montmul_p256_alt.S @@ -0,0 +1,205 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^256) mod p_256 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_montmul_p256_alt +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Does z := (2^{-256} * x * y) mod p_256, assuming that the inputs x and y +// satisfy x * y <= 2^256 * p_256 (in particular this is true if we are in +// the "usual" case x < p_256 and y < p_256). +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p256_alt) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1] and generating d4 from zero, re-using +// d0 as a temporary internally together with tmp. The "mc" parameter is +// assumed to be a register whose value is 0xFFFFFFFF00000001. +// It is fine for d4 to be the same register as d0, and it often is. 
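+//
+// (Illustrative identity, in Python-style big-integer notation: the macro
+// computes [d4;d3;d2;d1] = ([d3;d2;d1;d0] + d0 * p_256) >> 64, one word-sized
+// Montgomery reduction step; the shift is exact because p_256 + 1 is
+// divisible by 2**64.)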
+// --------------------------------------------------------------------------- + +#define montreds(d4,d3,d2,d1,d0, tmp,mc) \ + adds d1, d1, d0, lsl #32 __LF \ + lsr tmp, d0, #32 __LF \ + adcs d2, d2, tmp __LF \ + mul tmp, d0, mc __LF \ + umulh d4, d0, mc __LF \ + adcs d3, d3, tmp __LF \ + adc d4, d4, xzr + +#define z x0 +#define x x1 +#define y x2 + +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define a3 x6 +#define b0 x7 +#define b1 x8 +#define b2 x9 +#define b3 x10 + +#define l x11 + +#define u0 x12 +#define u1 x13 +#define u2 x14 +#define u3 x15 +#define u4 x16 + +// These alias to the input arguments when no longer needed + +#define u5 a0 +#define u6 a1 +#define u7 a2 +#define h a3 +#define mc b3 + +S2N_BN_SYMBOL(bignum_montmul_p256_alt): + +// Load operands and set up row 0 = [u4;...;u0] = a0 * [b3;...;b0] + + ldp a0, a1, [x] + ldp b0, b1, [y] + + mul u0, a0, b0 + umulh u1, a0, b0 + mul l, a0, b1 + umulh u2, a0, b1 + adds u1, u1, l + + ldp b2, b3, [y, #16] + + mul l, a0, b2 + umulh u3, a0, b2 + adcs u2, u2, l + + mul l, a0, b3 + umulh u4, a0, b3 + adcs u3, u3, l + adc u4, u4, xzr + + ldp a2, a3, [x, #16] + +// Row 1 = [u5;...;u0] = [a1;a0] * [b3;...;b0] + + mul l, a1, b0 + adds u1, u1, l + mul l, a1, b1 + adcs u2, u2, l + mul l, a1, b2 + adcs u3, u3, l + mul l, a1, b3 + adcs u4, u4, l + umulh u5, a1, b3 + adc u5, u5, xzr + + umulh l, a1, b0 + adds u2, u2, l + umulh l, a1, b1 + adcs u3, u3, l + umulh l, a1, b2 + adcs u4, u4, l + adc u5, u5, xzr + +// Row 2 = [u6;...;u0] = [a2;a1;a0] * [b3;...;b0] + + mul l, a2, b0 + adds u2, u2, l + mul l, a2, b1 + adcs u3, u3, l + mul l, a2, b2 + adcs u4, u4, l + mul l, a2, b3 + adcs u5, u5, l + umulh u6, a2, b3 + adc u6, u6, xzr + + umulh l, a2, b0 + adds u3, u3, l + umulh l, a2, b1 + adcs u4, u4, l + umulh l, a2, b2 + adcs u5, u5, l + adc u6, u6, xzr + +// Row 3 = [u7;...;u0] = [a3;...a0] * [b3;...;b0] +// Interleave the first Montgomery rotation of the low half + + mul l, a3, b0 + adds u3, u3, l + mul l, a3, b1 + adcs u4, u4, l + mul l, a3, b2 + adcs u5, u5, l + mul l, a3, b3 + adcs u6, u6, l + umulh u7, a3, b3 + adc u7, u7, xzr + + mov mc, 0xFFFFFFFF00000001 + montreds(u0,u3,u2,u1,u0, l,mc) + + umulh l, a3, b0 + adds u4, u4, l + umulh l, a3, b1 + adcs u5, u5, l + umulh l, a3, b2 + adcs u6, u6, l + adc u7, u7, xzr + +// Perform 3 further Montgomery steps to rotate the lower half + + montreds(u1,u0,u3,u2,u1, l,mc) + montreds(u2,u1,u0,u3,u2, l,mc) + montreds(u3,u2,u1,u0,u3, l,mc) + +// Add high and low parts, catching carry in b1 + + adds u0, u0, u4 + adcs u1, u1, u5 + adcs u2, u2, u6 + adcs u3, u3, u7 + cset b1, cs + +// Set [mc;0;l;-1] = p_256 and form [u7,u6,u5,u4] = [b1;u3;u2;u1;u0] - p_256 + + mov l, #0x00000000ffffffff + + subs u4, u0, #-1 + sbcs u5, u1, l + sbcs u6, u2, xzr + sbcs u7, u3, mc + sbcs xzr, b1, xzr + +// Now CF is clear if the comparison carried so the original was fine +// Otherwise take the form with p_256 subtracted. 
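+//
+// In reference terms (an added sketch, not from the verified source): the
+// quantity being reduced here is [b1;u3;u2;u1;u0], i.e. t + (b1 << 256)
+// with t < 2^256, and it is known to lie below 2 * p_256, so a single
+// conditional subtraction finishes the reduction. In Python:
+//
+//   P256 = 2**256 - 2**224 + 2**192 + 2**96 - 1
+//
+//   def final_reduce(t, b1):
+//       v = t + (b1 << 256)          # sum of the high and low halves
+//       return v - P256 if v >= P256 else v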
+ + csel u0, u0, u4, cc + csel u1, u1, u5, cc + csel u2, u2, u6, cc + csel u3, u3, u7, cc + +// Store back final result + + stp u0, u1, [z] + stp u2, u3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montsqr_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montsqr_p256.S new file mode 100644 index 00000000000..c23bebc57c1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montsqr_p256.S @@ -0,0 +1,325 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^256) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_montsqr_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Does z := (x^2 / 2^256) mod p_256, assuming x^2 <= 2^256 * p_256, which is +// guaranteed in particular if x < p_256 initially (the "intended" case). +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- + +// bignum_montsqr_p256 is functionally equivalent to +// unopt/bignum_montsqr_p256_base. +// It is written in a way that +// 1. A subset of scalar multiplications in bignum_montsqr_p256_base are carefully +// chosen and vectorized +// 2. The vectorized assembly is rescheduled using the SLOTHY superoptimizer. +// https://github.com/slothy-optimizer/slothy +// +// The output program of step 1. is as follows: +// +// ldp x7, x3, [x1] +// ldr q6, [x1] +// ldp x9, x8, [x1, #16] +// ldr q18, [x1, #16] +// ldr q27, [x1] +// umull v16.2D, v27.2S, v27.2S +// umull2 v17.2D, v27.4S, v27.4S +// xtn v30.2S, v27.2D +// uzp2 v27.4S, v27.4S, v27.4S +// umull v27.2D, v27.2S, v30.2S +// mov x6, v16.d[0] +// mov x12, v16.d[1] +// mov x13, v17.d[0] +// mov x1, v17.d[1] +// mov x15, v27.d[0] +// mov x10, v27.d[1] +// adds x4, x6, x15, lsl #33 +// lsr x6, x15, #31 +// adc x15, x12, x6 +// adds x13, x13, x10, lsl #33 +// lsr x6, x10, #31 +// adc x12, x1, x6 +// mul x6, x7, x3 +// umulh x1, x7, x3 +// adds x5, x15, x6, lsl #1 +// extr x6, x1, x6, #63 +// adcs x10, x13, x6 +// lsr x6, x1, #63 +// adc x15, x12, x6 +// lsl x6, x4, #32 +// subs x13, x4, x6 +// lsr x12, x4, #32 +// sbc x1, x4, x12 +// adds x6, x5, x6 +// adcs x5, x10, x12 +// adcs x10, x15, x13 +// adc x15, x1, xzr +// lsl x13, x6, #32 +// subs x12, x6, x13 +// lsr x1, x6, #32 +// sbc x6, x6, x1 +// adds x16, x5, x13 +// adcs x11, x10, x1 +// adcs x2, x15, x12 +// adc x17, x6, xzr +// uzp1 v30.4S, v18.4S, v6.4S +// rev64 v27.4S, v18.4S +// uzp1 v18.4S, v6.4S, v6.4S +// mul v27.4S, v27.4S, v6.4S +// uaddlp v5.2D, v27.4S +// shl v6.2D, v5.2D, #32 +// umlal v6.2D, v18.2S, v30.2S +// mov x4, v6.d[0] +// mov x5, v6.d[1] +// umulh x10, x7, x9 +// subs x6, x7, x3 +// cneg x13, x6, cc +// csetm x12, cc +// subs x6, x8, x9 +// cneg x6, x6, cc +// mul x1, x13, x6 +// umulh x6, x13, x6 +// cinv x15, x12, cc +// eor x12, x1, x15 +// eor x13, x6, x15 +// adds x1, x4, x10 +// adc x6, x10, xzr +// umulh x3, x3, x8 +// adds x1, x1, x5 +// adcs x6, x6, x3 +// adc x3, x3, xzr +// adds x6, x6, x5 +// adc x3, x3, xzr +// cmn x15, #0x1 +// adcs x12, x1, x12 +// adcs x1, x6, x13 +// adc x3, x3, x15 +// adds x6, x4, x4 +// adcs x13, x12, x12 +// adcs x12, x1, x1 +// adcs x1, x3, x3 +// adc x3, xzr, xzr +// adds x6, x6, x16 +// adcs x5, x13, x11 
+// adcs x10, x12, x2 +// adcs x15, x1, x17 +// adc x13, x3, xzr +// lsl x3, x6, #32 +// subs x12, x6, x3 +// lsr x1, x6, #32 +// sbc x6, x6, x1 +// adds x3, x5, x3 +// adcs x5, x10, x1 +// adcs x15, x15, x12 +// adcs x13, x13, x6 +// adc x10, xzr, xzr +// lsl x6, x3, #32 +// subs x12, x3, x6 +// lsr x1, x3, #32 +// sbc x3, x3, x1 +// adds x6, x5, x6 +// adcs x15, x15, x1 +// adcs x13, x13, x12 +// adcs x12, x10, x3 +// adc x1, xzr, xzr +// mul x3, x9, x9 +// adds x5, x6, x3 +// mul x6, x8, x8 +// umulh x3, x9, x9 +// adcs x15, x15, x3 +// adcs x13, x13, x6 +// umulh x3, x8, x8 +// adcs x12, x12, x3 +// adc x1, x1, xzr +// mul x6, x9, x8 +// umulh x3, x9, x8 +// adds x8, x6, x6 +// adcs x9, x3, x3 +// adc x3, xzr, xzr +// adds x10, x15, x8 +// adcs x15, x13, x9 +// adcs x13, x12, x3 +// adcs x12, x1, xzr +// mov x3, #0xffffffff +// adds x6, x5, #0x1 +// sbcs x8, x10, x3 +// mov x3, #0xffffffff00000001 +// sbcs x9, x15, xzr +// sbcs x1, x13, x3 +// sbcs xzr, x12, xzr +// csel x6, x6, x5, cs +// csel x8, x8, x10, cs +// csel x9, x9, x15, cs +// csel x3, x1, x13, cs +// stp x6, x8, [x0] // @slothy:writes=buffer0 +// stp x9, x3, [x0, #16] // @slothy:writes=buffer16 +// ret +// +// The bash script used for step 2 is as follows: +// +// # Store the assembly instructions except the last 'ret' as, say, 'input.S' +// export OUTPUTS="[hint_buffer0,hint_buffer16]" +// export RESERVED_REGS="[x18,x19,x20,x21,x22,x23,x24,x25,x26,x27,x28,x29,x30,sp,q8,q9,q10,q11,q12,q13,q14,q15,v8,v9,v10,v11,v12,v13,v14,v15]" +// /tools/external/slothy.sh input.S my_out_dir +// # my_out_dir/3.opt.s is the optimized assembly. Its output may differ +// # from this file since the sequence is non-deterministically chosen. +// # Please add 'ret' at the end of the output assembly. + + +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p256) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_montsqr_p256): + + ldr q19, [x1] + ldp x9, x13, [x1] + ldr q23, [x1, #16] + ldr q0, [x1] + ldp x1, x10, [x1, #16] + uzp2 v29.4S, v19.4S, v19.4S + xtn v4.2S, v19.2D + umulh x8, x9, x13 + rev64 v20.4S, v23.4S + umull v16.2D, v19.2S, v19.2S + umull v1.2D, v29.2S, v4.2S + mul v20.4S, v20.4S, v0.4S + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2D, v19.4S, v19.4S + mov x4, v16.d[0] + uzp1 v17.4S, v23.4S, v0.4S + uaddlp v19.2D, v20.4S + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4S, v0.4S, v0.4S + shl v19.2D, v19.2D, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2D, v20.2S, v17.2S + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn 
x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x16, x3, x16, cs + csel x14, x8, x14, cs + csel x12, x11, x12, cs + csel x2, x5, x2, cs + stp x14, x12, [x0, #16] + stp x16, x2, [x0] + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montsqr_p256_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montsqr_p256_alt.S new file mode 100644 index 00000000000..cecc797a5bc --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montsqr_p256_alt.S @@ -0,0 +1,183 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^256) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_montsqr_p256_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Does z := (x^2 / 2^256) mod p_256, assuming x^2 <= 2^256 * p_256, which is +// guaranteed in particular if x < p_256 initially (the "intended" case). +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p256_alt) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1] and generating d4 from zero, re-using +// d0 as a temporary internally together with "tmp". The "mc" parameter is +// assumed to be a register whose value is 0xFFFFFFFF00000001. +// It is fine for d4 to be the same register as d0, and it often is. 
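+//
+// At the top level, the contract shared by both squaring routines can be
+// written as one line of Python (reference sketch only; R is the Montgomery
+// radix 2^256):
+//
+//   P256 = 2**256 - 2**224 + 2**192 + 2**96 - 1
+//   R = 2**256
+//
+//   def montsqr_model(x):
+//       return (x * x * pow(R, -1, P256)) % P256
+//
+// so if the input is in Montgomery form, x == a * R (mod p_256), the output
+// is (a * a) * R (mod p_256) and squaring stays inside the Montgomery domain.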
+// --------------------------------------------------------------------------- + +#define montreds(d4,d3,d2,d1,d0, tmp,mc) \ + adds d1, d1, d0, lsl #32 __LF \ + lsr tmp, d0, #32 __LF \ + adcs d2, d2, tmp __LF \ + mul tmp, d0, mc __LF \ + umulh d4, d0, mc __LF \ + adcs d3, d3, tmp __LF \ + adc d4, d4, xzr + +#define z x0 +#define x x1 + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 + +#define l x6 +#define h x7 + +#define u0 x8 +#define u1 x9 +#define u2 x10 +#define u3 x11 +#define u4 x12 +#define u5 x13 +#define u6 x14 + +// This one is the same as h, which is safe with this computation sequence + +#define u7 h + +// This one is the same as a3, and is used for the Montgomery constant +// 0xFFFFFFFF00000001 + +#define mc x5 + +S2N_BN_SYMBOL(bignum_montsqr_p256_alt): + +// Load all the elements, set up an initial window [u6;...u1] = [23;03;01] +// and chain in the addition of 02 + 12 + 13 (no carry-out is possible). +// This gives all the "heterogeneous" terms of the squaring ready to double + + ldp a0, a1, [x] + + mul u1, a0, a1 + umulh u2, a0, a1 + + ldp a2, a3, [x, #16] + + mul u3, a0, a3 + umulh u4, a0, a3 + + mul l, a0, a2 + umulh h, a0, a2 + adds u2, u2, l + + adcs u3, u3, h + mul l, a1, a2 + umulh h, a1, a2 + adc h, h, xzr + adds u3, u3, l + + mul u5, a2, a3 + umulh u6, a2, a3 + + adcs u4, u4, h + mul l, a1, a3 + umulh h, a1, a3 + adc h, h, xzr + adds u4, u4, l + + adcs u5, u5, h + adc u6, u6, xzr + +// Now just double it; this simple approach seems to work better than extr + + adds u1, u1, u1 + adcs u2, u2, u2 + adcs u3, u3, u3 + adcs u4, u4, u4 + adcs u5, u5, u5 + adcs u6, u6, u6 + cset u7, cs + +// Add the homogeneous terms 00 + 11 + 22 + 33 + + umulh l, a0, a0 + mul u0, a0, a0 + adds u1, u1, l + + mul l, a1, a1 + adcs u2, u2, l + umulh l, a1, a1 + adcs u3, u3, l + + + mul l, a2, a2 + adcs u4, u4, l + umulh l, a2, a2 + adcs u5, u5, l + + mul l, a3, a3 + adcs u6, u6, l + umulh l, a3, a3 + adc u7, u7, l + +// Squaring complete. Perform 4 Montgomery steps to rotate the lower half + + mov mc, #0xFFFFFFFF00000001 + montreds(u0,u3,u2,u1,u0, a0,mc) + montreds(u1,u0,u3,u2,u1, a0,mc) + montreds(u2,u1,u0,u3,u2, a0,mc) + montreds(u3,u2,u1,u0,u3, a0,mc) + +// Add high and low parts, catching carry in a0 + + adds u0, u0, u4 + adcs u1, u1, u5 + adcs u2, u2, u6 + adcs u3, u3, u7 + cset a0, cs + +// Set [a3;0;a1;-1] = p_256 and form [u7,u6,u5,u4] = [a0;u3;u2;u1;u0] - p_256 +// Note that a3 == mc was already set above + + mov a1, #0x00000000ffffffff + + subs u4, u0, #-1 + sbcs u5, u1, a1 + sbcs u6, u2, xzr + sbcs u7, u3, mc + sbcs xzr, a0, xzr + +// Now CF is clear if the comparison carried so the original was fine +// Otherwise take the form with p_256 subtracted. + + csel u0, u0, u4, cc + csel u1, u1, u5, cc + csel u2, u2, u6, cc + csel u3, u3, u7, cc + +// Store back final result + + stp u0, u1, [z] + stp u2, u3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mux_4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mux_4.S new file mode 100644 index 00000000000..e678e652693 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mux_4.S @@ -0,0 +1,58 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// 256-bit multiplex/select z := x (if p nonzero) or z := y (if p zero) +// Inputs p, x[4], y[4]; output z[4] +// +// extern void bignum_mux_4 +// (uint64_t p, uint64_t z[static 4], +// uint64_t x[static 4], uint64_t y[static 4]); +// +// It is assumed that all numbers x, y and z have the same size 4 digits. +// +// Standard ARM ABI: X0 = p, X1 = z, X2 = x, X3 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mux_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mux_4) + .text + .balign 4 + +#define p x0 +#define z x1 +#define x x2 +#define y x3 +#define a x4 + + +S2N_BN_SYMBOL(bignum_mux_4): + +cmp p, #0 // Set condition codes p = 0 + + ldr a, [x] + ldr p, [y] + csel a, a, p, ne + str a, [z] + + ldr a, [x, #8] + ldr p, [y, #8] + csel a, a, p, ne + str a, [z, #8] + + ldr a, [x, #16] + ldr p, [y, #16] + csel a, a, p, ne + str a, [z, #16] + + ldr a, [x, #24] + ldr p, [y, #24] + csel a, a, p, ne + str a, [z, #24] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_neg_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_neg_p256.S new file mode 100644 index 00000000000..7007362fc85 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_neg_p256.S @@ -0,0 +1,67 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Negate modulo p_256, z := (-x) mod p_256, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_neg_p256 (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_neg_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_neg_p256) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define p x2 +#define t x3 + +#define d0 x4 +#define d1 x5 +#define d2 x6 +#define d3 x7 + + +S2N_BN_SYMBOL(bignum_neg_p256): + +// Load the 4 digits of x + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Set a bitmask p for the input being nonzero, so that we avoid doing +// -0 = p_256 and hence maintain strict modular reduction + + orr t, d0, d1 + orr p, d2, d3 + orr p, p, t + cmp p, #0 + csetm p, ne + +// Mask the nontrivial words of p_256 = [n3;0;n1;-1] and subtract + + subs d0, p, d0 + and t, p, #0x00000000ffffffff + sbcs d1, t, d1 + sbcs d2, xzr, d2 + and t, p, #0xffffffff00000001 + sbc d3, t, d3 + +// Write back the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + +// Return + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_nonzero_4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_nonzero_4.S new file mode 100644 index 00000000000..3d6f8b36539 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_nonzero_4.S @@ -0,0 +1,44 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// 256-bit nonzeroness test, returning 1 if x is nonzero, 0 if x is zero +// Input x[4]; output function return +// +// extern uint64_t bignum_nonzero_4(uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = x, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_nonzero_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_nonzero_4) + .text + .balign 4 + +#define x x0 +#define a x1 +#define d x2 +#define c x3 + + +S2N_BN_SYMBOL(bignum_nonzero_4): + +// Generate a = an OR of all the words in the bignum + + ldp a, d, [x] + orr a, a, d + ldp c, d, [x, #16] + orr c, c, d + orr a, a, c + +// Set a standard C condition based on whether a is nonzero + + cmp a, xzr + cset x0, ne + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_optneg_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_optneg_p256.S new file mode 100644 index 00000000000..2f9260d5871 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_optneg_p256.S @@ -0,0 +1,84 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally negate modulo p_256, z := (-x) mod p_256 (if p nonzero) or +// z := x (if p zero), assuming x reduced +// Inputs p, x[4]; output z[4] +// +// extern void bignum_optneg_p256 +// (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = p, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optneg_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optneg_p256) + .text + .balign 4 + +#define z x0 +#define p x1 +#define x x2 + +#define d0 x3 +#define d1 x4 +#define d2 x5 +#define d3 x6 +#define n0 x7 +#define n1 x8 +#define n2 x9 +#define n3 x10 + + +S2N_BN_SYMBOL(bignum_optneg_p256): + +// Load the 4 digits of x + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Adjust p by zeroing it if the input is zero (to avoid giving -0 = p, which +// is not strictly reduced even though it's correct modulo p) + + orr n0, d0, d1 + orr n1, d2, d3 + orr n2, n0, n1 + cmp n2, #0 + csel p, xzr, p, eq + +// Load the three nonzero words of p_256 = [n3;0;n1;n0] + + mov n0, #0xffffffffffffffff + mov n1, #0x00000000ffffffff + mov n3, #0xffffffff00000001 + +// Do the subtraction, which by hypothesis does not underflow + + subs n0, n0, d0 + sbcs n1, n1, d1 + sbcs n2, xzr, d2 + sbc n3, n3, d3 + +// Set condition code if original x is nonzero and p was nonzero + + cmp p, #0 + +// Hence multiplex and write back + + csel n0, n0, d0, ne + csel n1, n1, d1, ne + csel n2, n2, d2, ne + csel n3, n3, d3, ne + + stp n0, n1, [z] + stp n2, n3, [z, #16] + +// Return + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_sub_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_sub_p256.S new file mode 100644 index 00000000000..5e71ca0892c --- /dev/null +++ 
b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_sub_p256.S @@ -0,0 +1,67 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Subtract modulo p_256, z := (x - y) mod p_256 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_sub_p256 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub_p256) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 +#define c x3 +#define l x4 +#define d0 x5 +#define d1 x6 +#define d2 x7 +#define d3 x8 + + +S2N_BN_SYMBOL(bignum_sub_p256): + +// First just subtract the numbers as [d3; d2; d1; d0] +// Set a mask based on (inverted) carry indicating x < y = correction is needed + + ldp d0, d1, [x] + ldp l, c, [y] + subs d0, d0, l + sbcs d1, d1, c + ldp d2, d3, [x, #16] + ldp l, c, [y, #16] + sbcs d2, d2, l + sbcs d3, d3, c + +// Create a mask for the condition x < y, when we need to correct + + csetm c, cc + +// Now correct by adding masked p_256 + + adds d0, d0, c + and l, c, #0x00000000ffffffff + adcs d1, d1, l + adcs d2, d2, xzr + and l, c, #0xffffffff00000001 + adc d3, d3, l + +// Store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_tomont_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_tomont_p256.S new file mode 100644 index 00000000000..37574f3087d --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_tomont_p256.S @@ -0,0 +1,116 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert to Montgomery form z := (2^256 * x) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_tomont_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tomont_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tomont_p256) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tomont_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tomont_p256_alt) + .text + .balign 4 + +// ---------------------------------------------------------------------------- +// Core "x |-> (2^64 * x) mod p_256" macro, with x assumed to be < p_256. +// Input is in [d4;d3;d2;d1] and output in [d3;d2;d1;d0] +// using d4 as well as t1, t2, t3 as temporaries. +// ---------------------------------------------------------------------------- + +#define modstep_p256(d4, d3,d2,d1,d0, t1,t2,t3) \ +/* Writing the input as z = 2^256 * h + 2^192 * l + t = 2^192 * hl + t, */ \ +/* our quotient approximation is MIN ((hl + hl>>32 + 1)>>64) (2^64 - 1). 
*/ \ + subs xzr, xzr, xzr __LF/* Set carry flag for +1 */ \ + extr t3, d4, d3, #32 __LF \ + adcs xzr, d3, t3 __LF \ + lsr t3, d4, #32 __LF \ + adcs t3, d4, t3 __LF \ + csetm d0, cs __LF \ + orr t3, t3, d0 __LF \ +/* First do [t2;t1] = 2^32 * q, which we use twice */ \ + lsl t1, t3, #32 __LF \ + lsr t2, t3, #32 __LF \ +/* Add 2^224 * q to sum */ \ + adds d3, d3, t1 __LF \ + adc d4, d4, t2 __LF \ +/* Accumulate [t2;t1;d0] = (2^96 - 1) * q */ \ + subs d0, xzr, t3 __LF \ + sbcs t1, t1, xzr __LF \ + sbc t2, t2, xzr __LF \ +/* Subtract (2^256 + 2^192 + 2^96 - 1) * q */ \ + subs d0, xzr, d0 __LF \ + sbcs d1, d1, t1 __LF \ + sbcs d2, d2, t2 __LF \ + sbcs d3, d3, t3 __LF \ + sbcs d4, d4, t3 __LF \ +/* Use top word as mask to correct */ \ + adds d0, d0, d4 __LF \ + mov t1, #0x00000000ffffffff __LF \ + and t1, t1, d4 __LF \ + adcs d1, d1, t1 __LF \ + adcs d2, d2, xzr __LF \ + mov t1, #0xffffffff00000001 __LF \ + and t1, t1, d4 __LF \ + adc d3, d3, t1 + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 +#define d4 x6 + +#define t0 x1 +#define t1 x7 +#define t2 x8 +#define t3 x9 + +S2N_BN_SYMBOL(bignum_tomont_p256): + +S2N_BN_SYMBOL(bignum_tomont_p256_alt): + +// Load the input + + ldp d0, d1, [x1] + ldp d2, d3, [x1, #16] + +// Do an initial reduction to make sure this is < p_256, using just +// a copy of the bignum_mod_p256_4 code. This is needed to set up the +// invariant "input < p_256" for the main modular reduction steps. + + mov t0, #0xffffffffffffffff + mov t1, #0x00000000ffffffff + mov t3, #0xffffffff00000001 + subs t0, d0, t0 + sbcs t1, d1, t1 + sbcs t2, d2, xzr + sbcs t3, d3, t3 + csel d0, d0, t0, cc + csel d1, d1, t1, cc + csel d2, d2, t2, cc + csel d3, d3, t3, cc + +// Successively multiply by 2^64 and reduce + + modstep_p256(d3,d2,d1,d0,d4, t1,t2,t3) + modstep_p256(d2,d1,d0,d4,d3, t1,t2,t3) + modstep_p256(d1,d0,d4,d3,d2, t1,t2,t3) + modstep_p256(d0,d4,d3,d2,d1, t1,t2,t3) + +// Store the result and return + + stp d1, d2, [x0] + stp d3, d4, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_triple_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_triple_p256.S new file mode 100644 index 00000000000..ad2ee2c223b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_triple_p256.S @@ -0,0 +1,112 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Triple modulo p_256, z := (3 * x) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_triple_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// The input x can be any 4-digit bignum, not necessarily reduced modulo p_256, +// and the result is always fully reduced, i.e. z = (3 * x) mod p_256. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_p256) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_p256_alt) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 +#define h x6 + +// Slightly offset aliases for the d_i for readability. 
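+//
+// Reference sketch of the reduction used below (added for clarity, not part
+// of the source): with z = 3 * x < 3 * 2^256 and h = z >> 256, the quotient
+// estimate q = h + 1 gives -p_256 <= z - q * p_256 < p_256, so at most one
+// add-back of p_256 is needed. In Python:
+//
+//   P256 = 2**256 - 2**224 + 2**192 + 2**96 - 1
+//
+//   def triple_model(x):             # any x with 0 <= x < 2^256
+//       z = 3 * x
+//       q = (z >> 256) + 1
+//       r = z - q * P256
+//       return r + P256 if r < 0 else r
+//
+//   assert triple_model(2**256 - 1) == (3 * (2**256 - 1)) % P256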
+ +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define a3 x6 + +// More aliases for the same thing at different stages + +#define q x7 +#define c x7 + +// Other temporary variables + +#define t0 x8 +#define t1 x9 + + +S2N_BN_SYMBOL(bignum_triple_p256): + +S2N_BN_SYMBOL(bignum_triple_p256_alt): + +// Load the inputs + + ldp a0, a1, [x] + ldp a2, a3, [x, #16] + +// First do the multiplication by 3, getting z = [h; d3; ...; d0] + + lsl d0, a0, #1 + adds d0, d0, a0 + extr d1, a1, a0, #63 + adcs d1, d1, a1 + extr d2, a2, a1, #63 + adcs d2, d2, a2 + extr d3, a3, a2, #63 + adcs d3, d3, a3 + lsr h, a3, #63 + adc h, h, xzr + +// For this limited range a simple quotient estimate of q = h + 1 works, where +// h = floor(z / 2^256). Then -p_256 <= z - q * p_256 < p_256, so we just need +// to subtract q * p_256 and then if that's negative, add back p_256. + + add q, h, #1 + +// Initial subtraction of z - q * p_256, with bitmask c for the carry + + lsl t1, q, #32 + adds d3, d3, t1 + adc h, h, xzr + sub t0, xzr, q + sub t1, t1, #1 + subs d0, d0, t0 + sbcs d1, d1, t1 + sbcs d2, d2, xzr + sbcs d3, d3, q + sbc c, h, q + +// Use the bitmask c for final masked addition of p_256. + + adds d0, d0, c + mov t0, #0x00000000ffffffff + and t0, t0, c + adcs d1, d1, t0 + adcs d2, d2, xzr + neg t1, t0 + adc d3, d3, t1 + +// Finally store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjadd.S new file mode 100644 index 00000000000..548da6eb568 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjadd.S @@ -0,0 +1,3160 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjadd +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + +// This is functionally equivalent to p256_montjadd in unopt/p256_montjadd.S. +// This is the result of doing the following sequence of optimizations: +// 1. Function inlining +// 2. Eliminating redundant load/store instructions +// 3. Folding (add addr, const) + load/store +// Function inlining is done manually. The second and third optimizations are +// done by a script. + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjadd) + + .text + .balign 4 + +#define NUMSIZE 32 +#define NSPACE (NUMSIZE*7) + +S2N_BN_SYMBOL(p256_montjadd): + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x30, [sp, #-16]! 
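+//
+// Representation sketch (added for reference, not from the source): each
+// Montgomery-Jacobian triple (x',y',z') is the Jacobian triple (x,y,z) with
+// every coordinate multiplied by R = 2^256 mod p_256, and (x,y,z) in turn
+// denotes the affine point (x/z^2, y/z^3). Decoding such a triple in Python:
+//
+//   P256 = 2**256 - 2**224 + 2**192 + 2**96 - 1
+//   R_INV = pow(2**256, -1, P256)
+//
+//   def decode(xm, ym, zm):
+//       x = (xm * R_INV) % P256
+//       y = (ym * R_INV) % P256
+//       z = (zm * R_INV) % P256
+//       zi = pow(z, -1, P256)        # assumes z != 0, i.e. not the point at infinity
+//       return (x * zi * zi) % P256, (y * zi * zi * zi) % P256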
+ sub sp, sp, NSPACE + + mov x21, x0 + mov x22, x1 + mov x23, x2 + mov x0, sp + ldr q19, [x22, #64] + ldp x9, x13, [x22, #64] + ldr q23, [x22, #80] + ldr q0, [x22, #64] + ldp x1, x10, [x22, #80] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x19, x3, x16, cs + csel x14, x8, x14, cs + csel x12, x11, x12, cs + csel x20, x5, x2, cs + stp x14, x12, [x0, #16] + stp x19, x20, [x0] + ldr q19, [x23, #64] + ldp x9, x13, [x23, #64] + ldr q23, [x23, #80] + ldr q0, [x23, #64] + ldp x1, x10, [x23, #80] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, 
x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x16, x3, x16, cs + csel x14, x8, x14, cs + csel x12, x11, x12, cs + csel x2, x5, x2, cs + stp x14, x12, [sp, #176] + stp x16, x2, [sp, #160] + ldr q20, [x22, #32] + ldp x7, x17, [x23, #64] + ldr q0, [x23, #64] + ldp x6, x10, [x22, #32] + ldp x11, x15, [x23, #80] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [x22, #48] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [x23, #80] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x22, #48] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor 
x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [sp, #192] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #208] + ldr q20, [x23, #32] + ldp x7, x17, [x22, #64] + ldr q0, [x22, #64] + ldp x6, x10, [x23, #32] + ldp x11, x15, [x22, #80] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [x23, #48] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [x22, #80] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + 
adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x23, #48] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x24, x11, x13 + and x1, x1, x13 + adcs x25, x4, x1 + and x1, x12, x13 + stp x24, x25, [sp, #32] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #48] + mov x1, sp + ldr q20, [x23, #0] + ldr q0, [x1] + ldp x6, x10, [x23, #0] + ldp x11, x15, [x1, #16] + rev64 v16.4s, v20.4s + subs x4, x19, x20 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x20, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x19 + ldr q20, [x23, #16] + sbcs x5, x15, x20 + ngc x17, xzr + subs x8, x11, x15 + 
uaddlp v27.2d, v16.4s + umulh x4, x19, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [x1, #16] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x23, #16] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x11, x11, x13 + and x1, 
x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [sp, #64] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #80] + ldr q20, [x22, #0] + ldp x7, x17, [sp, #160] + ldr q0, [sp, #160] + ldp x6, x10, [x22, #0] + ldp x11, x15, [sp, #176] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [x22, #16] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #176] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x22, #16] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + 
adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [sp, #128] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #144] + mov x1, sp + ldr q20, [sp, #32] + ldp x7, x17, [x1] + ldr q0, [x1] + ldp x11, x15, [x1, #16] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x25 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [sp, #48] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x24 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x25, x24 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [x1, #16] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #48] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x24, x7 + sbcs x9, x25, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x24, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x25, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x24 + eor x1, x10, x5 + adcs x16, x2, x25 
+ adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x19, x11, x13 + and x1, x1, x13 + adcs x20, x4, x1 + and x1, x12, x13 + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #48] + ldr q20, [sp, #192] + ldp x7, x17, [sp, #160] + ldr q0, [sp, #160] + ldp x6, x10, [sp, #192] + ldp x11, x15, [sp, #176] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [sp, #208] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #176] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #208] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, 
#0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x9, x11, x13 + and x1, x1, x13 + adcs x10, x4, x1 + and x1, x12, x13 + stp x9, x10, [sp, #192] + adcs x11, x7, xzr + adc x12, x17, x1 + stp x11, x12, [sp, #208] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x13, x5, x3 + and x4, x3, #0xffffffff + adcs x24, x6, x4 + adcs x25, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x26, x8, x4 + stp x13, x24, [sp, #160] + stp x25, x26, [sp, #176] + subs x5, x19, x9 + sbcs x6, x20, x10 + ldp x7, x8, [sp, #48] + sbcs x7, x7, x11 + sbcs x8, x8, x12 + csetm x3, cc + adds x19, x5, x3 + and x4, x3, #0xffffffff + adcs x20, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x19, x20, [sp, #32] + stp x7, x8, [sp, #48] + ldr q19, [sp, #160] + ldr q23, [sp, #176] + ldr q0, [sp, #160] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x13, x24 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x13, x24 + umulh x15, x13, x25 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x13, x24 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x26, x25 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x24, x26 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + 
adcs x6, x11, x8 + umulh x11, x25, x26 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x25, x26 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x26, x26 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x26, x26 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x25, x25 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x25, x25 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x24, x3, x16, cs + csel x25, x8, x14, cs + csel x26, x11, x12, cs + csel x27, x5, x2, cs + stp x25, x26, [sp, #112] + stp x24, x27, [sp, #96] + mov x0, sp + ldr q19, [sp, #32] + ldr q23, [sp, #48] + ldr q0, [sp, #32] + ldp x1, x10, [sp, #48] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x19, x20 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x19, x20 + umulh x15, x19, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x19, x20 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x20, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, 
#0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x16, x3, x16, cs + csel x14, x8, x14, cs + csel x12, x11, x12, cs + csel x2, x5, x2, cs + stp x14, x12, [x0, #16] + stp x16, x2, [x0] + ldr q20, [sp, #128] + ldr q0, [sp, #96] + ldp x6, x10, [sp, #128] + rev64 v16.4s, v20.4s + subs x4, x24, x27 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x27, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x25, x24 + ldr q20, [sp, #144] + sbcs x5, x26, x27 + ngc x17, xzr + subs x8, x25, x26 + uaddlp v27.2d, v16.4s + umulh x4, x24, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #112] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #144] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x25, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x26, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds 
x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x25 + eor x1, x10, x5 + adcs x16, x2, x26 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x19, x11, x13 + and x1, x1, x13 + adcs x20, x4, x1 + and x1, x12, x13 + stp x19, x20, [sp, #128] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #144] + ldr q20, [sp, #64] + ldr q0, [sp, #96] + ldp x6, x10, [sp, #64] + ldp x11, x15, [sp, #112] + rev64 v16.4s, v20.4s + subs x4, x24, x27 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x27, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x24 + ldr q20, [sp, #80] + sbcs x5, x15, x27 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x24, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #112] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #80] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds 
x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x9, x11, x13 + and x1, x1, x13 + adcs x10, x4, x1 + and x1, x12, x13 + stp x9, x10, [sp, #64] + adcs x11, x7, xzr + adc x12, x17, x1 + stp x11, x12, [sp, #80] + mov x0, sp + mov x1, sp + ldp x5, x6, [x1] + subs x5, x5, x19 + sbcs x6, x6, x20 + ldp x7, x8, [x1, #16] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x24, x5, x3 + and x4, x3, #0xffffffff + adcs x25, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x7, x8, [x0, #16] + subs x5, x9, x19 + sbcs x6, x10, x20 + ldp x4, x3, [sp, #144] + sbcs x7, x11, x4 + sbcs x8, x12, x3 + csetm x3, cc + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [sp, #96] + stp x7, x8, [sp, #112] + ldr q20, [x22, #64] + ldp x7, x17, [sp, #160] + ldr q0, [sp, #160] + ldp x6, x10, [x22, #64] + ldp x11, x15, [sp, #176] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [x22, #80] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #176] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x22, #80] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs 
x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [sp, #160] + adcs x19, x7, xzr + adc x20, x17, x1 + stp x19, x20, [sp, #176] + mov x0, sp + mov x1, sp + ldp x4, x3, [sp, #64] + subs x5, x24, x4 + sbcs x6, x25, x3 + ldp x7, x8, [x1, #16] + ldp x4, x3, [sp, #80] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x9, x5, x3 + and x4, x3, #0xffffffff + adcs x10, x6, x4 + adcs x11, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x3, x8, x4 + stp x9, x10, [x0] + stp x11, x3, [x0, #16] + ldp x5, x6, [sp, #128] + subs x5, x5, x9 + sbcs x6, x6, x10 + ldp x7, x8, [sp, #144] + sbcs x7, x7, x11 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, 
#144] + ldr q20, [sp, #192] + ldp x7, x17, [sp, #96] + ldr q0, [sp, #96] + ldp x6, x10, [sp, #192] + ldp x11, x15, [sp, #112] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [sp, #208] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #112] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #208] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, 
x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [sp, #96] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #112] + ldr q20, [x23, #64] + ldp x7, x17, [sp, #160] + ldr q0, [sp, #160] + ldp x6, x10, [x23, #64] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x19, x7 + ldr q20, [x23, #80] + sbcs x5, x20, x17 + ngc x17, xzr + subs x8, x19, x20 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #176] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x23, #80] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x24, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x25, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x24 + eor x1, x10, x5 + adcs x16, x2, x25 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 
+ eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x19, x11, x13 + and x1, x1, x13 + adcs x20, x4, x1 + and x1, x12, x13 + stp x19, x20, [sp, #160] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #176] + ldr q20, [sp, #128] + ldp x7, x17, [sp, #32] + ldr q0, [sp, #32] + ldp x6, x10, [sp, #128] + ldp x11, x15, [sp, #48] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [sp, #144] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #48] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #144] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + 
mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x5, x11, x13 + and x1, x1, x13 + adcs x6, x4, x1 + and x1, x12, x13 + adcs x7, x7, xzr + adc x9, x17, x1 + ldp x4, x3, [sp, #96] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x4, x3, [sp, #112] + sbcs x7, x7, x4 + sbcs x8, x9, x3 + csetm x3, cc + adds x15, x5, x3 + and x4, x3, #0xffffffff + adcs x24, x6, x4 + adcs x25, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x26, x8, x4 + stp x15, x24, [sp, #128] + stp x25, x26, [sp, #144] + ldp x0, x1, [x22, #64] + ldp x2, x3, [x22, #80] + orr x12, x0, x1 + orr x13, x2, x3 + orr x12, x12, x13 + cmp x12, xzr + cset x12, ne + ldp x4, x5, [x23, #64] + ldp x6, x7, [x23, #80] + orr x13, x4, x5 + orr x14, x6, x7 + orr x13, x13, x14 + cmp x13, xzr + cset x13, ne + cmp x13, x12 + csel x8, x0, x19, cc + csel x9, x1, x20, cc + csel x8, x4, x8, hi + csel x9, x5, x9, hi + ldp x10, x11, [sp, #176] + csel x10, x2, x10, cc + csel x11, x3, x11, cc + csel x10, x6, x10, hi + csel x11, x7, x11, hi + ldp x12, x13, [x22] + ldp x0, x1, [sp] + csel x0, x12, x0, cc + csel x1, x13, x1, cc + ldp x12, x13, [x23] + csel x0, x12, x0, hi + csel x1, x13, x1, hi + ldp x12, x13, [x22, #16] + ldp x2, x3, [sp, #16] + csel x2, x12, x2, cc + csel x3, x13, x3, cc + ldp x12, x13, [x23, #16] + csel x2, x12, x2, hi + csel x3, x13, x3, hi + ldp x12, x13, [x22, #32] + csel x4, x12, x15, cc + csel x5, x13, x24, cc + ldp x12, x13, [x23, #32] + csel x4, x12, x4, hi + csel x5, x13, x5, hi + ldp x12, x13, [x22, #48] + csel x6, x12, x25, cc + csel x7, x13, x26, cc + ldp x12, x13, [x23, #48] + csel x6, x12, x6, hi + csel x7, x13, x7, hi + stp x0, x1, [x21] + stp x2, x3, [x21, #16] + stp x4, x5, [x21, #32] + stp x6, x7, [x21, #48] + stp x8, x9, [x21, #64] + stp x10, x11, [x21, #80] + + add sp, sp, NSPACE + ldp x27, x30, [sp], 16 + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjadd_alt.S new file mode 100644 index 00000000000..4849a2857a2 --- /dev/null +++ 
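
For reference, the borrow-masked correction that recurs throughout the subtraction sequences above (a subs/sbcs chain, csetm on the carry-clear condition to turn the borrow into an all-ones mask, then adding back p_256 with its limbs ANDed against that mask) can be modelled with plain Python integers. This is only an illustrative sketch; the function name below is mine, not taken from the file.

    # Sketch of the subtract-then-correct idiom used for subtraction mod p_256.
    P256 = 2**256 - 2**224 + 2**192 + 2**96 - 1   # the NIST P-256 prime

    def sub_mod_p256(a, b):
        d = a - b            # the subs/sbcs chain over the four 64-bit limbs
        if d < 0:            # a borrow occurred: csetm produced the all-ones mask
            d += P256        # add back p_256 (limb-wise, masked by the borrow)
        return d             # result is reduced again, in [0, p_256)

    assert sub_mod_p256(5, 7) == P256 - 2
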
b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjadd_alt.S @@ -0,0 +1,549 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjadd_alt +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjadd_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x15 +#define input_x x16 +#define input_y x17 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE +#define z_2 input_y, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define x1a sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define z2sq sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define y1a sp, #(NUMSIZE*6) + +#define NSPACE (NUMSIZE*7) + +// Corresponds to bignum_montmul_p256_alt except registers + +#define montmul_p256(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x0, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x1, x3, x10 __LF \ + adcs x0, x0, x11 __LF \ + adc x1, x1, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x1, x1, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x0, x0, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x1, x1, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x0, x0, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x1, x1, x11 __LF \ + 
umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x0, x0, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + mov x10, #0xffffffff00000001 __LF \ + adds x13, x13, x12, lsl #32 __LF \ + lsr x11, x12, #32 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x12, x10 __LF \ + umulh x12, x12, x10 __LF \ + adcs x0, x0, x11 __LF \ + adc x12, x12, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x1, x1, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + adds x14, x14, x13, lsl #32 __LF \ + lsr x11, x13, #32 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x13, x10 __LF \ + umulh x13, x13, x10 __LF \ + adcs x12, x12, x11 __LF \ + adc x13, x13, xzr __LF \ + adds x0, x0, x14, lsl #32 __LF \ + lsr x11, x14, #32 __LF \ + adcs x12, x12, x11 __LF \ + mul x11, x14, x10 __LF \ + umulh x14, x14, x10 __LF \ + adcs x13, x13, x11 __LF \ + adc x14, x14, xzr __LF \ + adds x12, x12, x0, lsl #32 __LF \ + lsr x11, x0, #32 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x0, x10 __LF \ + umulh x0, x0, x10 __LF \ + adcs x14, x14, x11 __LF \ + adc x0, x0, xzr __LF \ + adds x12, x12, x1 __LF \ + adcs x13, x13, x3 __LF \ + adcs x14, x14, x4 __LF \ + adcs x0, x0, x5 __LF \ + cset x8, cs __LF \ + mov x11, #0xffffffff __LF \ + adds x1, x12, #0x1 __LF \ + sbcs x3, x13, x11 __LF \ + sbcs x4, x14, xzr __LF \ + sbcs x5, x0, x10 __LF \ + sbcs xzr, x8, xzr __LF \ + csel x12, x12, x1, cc __LF \ + csel x13, x13, x3, cc __LF \ + csel x14, x14, x4, cc __LF \ + csel x0, x0, x5, cc __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x0, [P0+16] + +// Corresponds exactly to bignum_montsqr_p256_alt + +#define montsqr_p256(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x6, x2, x4 __LF \ + umulh x7, x2, x4 __LF \ + adds x10, x10, x6 __LF \ + adcs x11, x11, x7 __LF \ + mul x6, x3, x4 __LF \ + umulh x7, x3, x4 __LF \ + adc x7, x7, xzr __LF \ + adds x11, x11, x6 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x7 __LF \ + mul x6, x3, x5 __LF \ + umulh x7, x3, x5 __LF \ + adc x7, x7, xzr __LF \ + adds x12, x12, x6 __LF \ + adcs x13, x13, x7 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x7, cs __LF \ + umulh x6, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x6 __LF \ + mul x6, x3, x3 __LF \ + adcs x10, x10, x6 __LF \ + umulh x6, x3, x3 __LF \ + adcs x11, x11, x6 __LF \ + mul x6, x4, x4 __LF \ + adcs x12, x12, x6 __LF \ + umulh x6, x4, x4 __LF \ + adcs x13, x13, x6 __LF \ + mul x6, x5, x5 __LF \ + adcs x14, x14, x6 __LF \ + umulh x6, x5, x5 __LF \ + adc x7, x7, x6 __LF \ + adds x9, x9, x8, lsl #32 __LF \ + lsr x3, x8, #32 __LF \ + adcs x10, x10, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x8, x3 __LF \ + umulh x8, x8, x3 __LF \ + adcs x11, x11, x2 __LF \ + adc x8, x8, xzr __LF \ + adds x10, x10, x9, lsl #32 __LF \ + lsr x3, x9, #32 __LF \ + adcs x11, x11, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x9, x3 __LF \ + umulh x9, x9, x3 __LF \ + adcs x8, x8, x2 __LF \ + adc x9, x9, xzr __LF \ + adds x11, x11, x10, lsl #32 
__LF \ + lsr x3, x10, #32 __LF \ + adcs x8, x8, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x10, x3 __LF \ + umulh x10, x10, x3 __LF \ + adcs x9, x9, x2 __LF \ + adc x10, x10, xzr __LF \ + adds x8, x8, x11, lsl #32 __LF \ + lsr x3, x11, #32 __LF \ + adcs x9, x9, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x11, x3 __LF \ + umulh x11, x11, x3 __LF \ + adcs x10, x10, x2 __LF \ + adc x11, x11, xzr __LF \ + adds x8, x8, x12 __LF \ + adcs x9, x9, x13 __LF \ + adcs x10, x10, x14 __LF \ + adcs x11, x11, x7 __LF \ + cset x2, cs __LF \ + mov x3, #0xffffffff __LF \ + mov x5, #0xffffffff00000001 __LF \ + adds x12, x8, #0x1 __LF \ + sbcs x13, x9, x3 __LF \ + sbcs x14, x10, xzr __LF \ + sbcs x7, x11, x5 __LF \ + sbcs xzr, x2, xzr __LF \ + csel x8, x8, x12, cc __LF \ + csel x9, x9, x13, cc __LF \ + csel x10, x10, x14, cc __LF \ + csel x11, x11, x7, cc __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Almost-Montgomery variant which we use when an input to other muls +// with the other argument fully reduced (which is always safe). + +#define amontsqr_p256(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x6, x2, x4 __LF \ + umulh x7, x2, x4 __LF \ + adds x10, x10, x6 __LF \ + adcs x11, x11, x7 __LF \ + mul x6, x3, x4 __LF \ + umulh x7, x3, x4 __LF \ + adc x7, x7, xzr __LF \ + adds x11, x11, x6 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x7 __LF \ + mul x6, x3, x5 __LF \ + umulh x7, x3, x5 __LF \ + adc x7, x7, xzr __LF \ + adds x12, x12, x6 __LF \ + adcs x13, x13, x7 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x7, cs __LF \ + umulh x6, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x6 __LF \ + mul x6, x3, x3 __LF \ + adcs x10, x10, x6 __LF \ + umulh x6, x3, x3 __LF \ + adcs x11, x11, x6 __LF \ + mul x6, x4, x4 __LF \ + adcs x12, x12, x6 __LF \ + umulh x6, x4, x4 __LF \ + adcs x13, x13, x6 __LF \ + mul x6, x5, x5 __LF \ + adcs x14, x14, x6 __LF \ + umulh x6, x5, x5 __LF \ + adc x7, x7, x6 __LF \ + adds x9, x9, x8, lsl #32 __LF \ + lsr x3, x8, #32 __LF \ + adcs x10, x10, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x8, x3 __LF \ + umulh x8, x8, x3 __LF \ + adcs x11, x11, x2 __LF \ + adc x8, x8, xzr __LF \ + adds x10, x10, x9, lsl #32 __LF \ + lsr x3, x9, #32 __LF \ + adcs x11, x11, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x9, x3 __LF \ + umulh x9, x9, x3 __LF \ + adcs x8, x8, x2 __LF \ + adc x9, x9, xzr __LF \ + adds x11, x11, x10, lsl #32 __LF \ + lsr x3, x10, #32 __LF \ + adcs x8, x8, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x10, x3 __LF \ + umulh x10, x10, x3 __LF \ + adcs x9, x9, x2 __LF \ + adc x10, x10, xzr __LF \ + adds x8, x8, x11, lsl #32 __LF \ + lsr x3, x11, #32 __LF \ + adcs x9, x9, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x11, x3 __LF \ + umulh x11, x11, x3 __LF \ + adcs x10, x10, x2 __LF \ + adc x11, x11, xzr __LF \ + adds x8, x8, x12 __LF \ + adcs x9, x9, x13 __LF \ + adcs x10, x10, x14 __LF \ + adcs x11, x11, x7 __LF \ + mov x2, #0xffffffffffffffff __LF \ + csel x2, xzr, x2, cc __LF \ + mov x3, #0xffffffff __LF \ + csel x3, xzr, x3, cc __LF \ + mov x5, #0xffffffff00000001 __LF \ + csel x5, xzr, x5, cc __LF \ + subs x8, x8, x2 __LF \ + sbcs x9, x9, x3 __LF \ + sbcs x10, x10, xzr 
__LF \ + sbc x11, x11, x5 __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Corresponds exactly to bignum_sub_p256 + +#define sub_p256(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + csetm x3, cc __LF \ + adds x5, x5, x3 __LF \ + mov x4, #0xffffffff __LF \ + and x4, x4, x3 __LF \ + adcs x6, x6, x4 __LF \ + adcs x7, x7, xzr __LF \ + mov x4, #0xffffffff00000001 __LF \ + and x4, x4, x3 __LF \ + adc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +S2N_BN_SYMBOL(p256_montjadd_alt): + +// Make room on stack for temporary variables +// Move the input arguments to stable places + + sub sp, sp, NSPACE + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations +// 12 * multiply + 4 * square + 7 * subtract + + amontsqr_p256(z1sq,z_1) + amontsqr_p256(z2sq,z_2) + + montmul_p256(y1a,z_2,y_1) + montmul_p256(y2a,z_1,y_2) + + montmul_p256(x2a,z1sq,x_2) + montmul_p256(x1a,z2sq,x_1) + montmul_p256(y2a,z1sq,y2a) + montmul_p256(y1a,z2sq,y1a) + + sub_p256(xd,x2a,x1a) + sub_p256(yd,y2a,y1a) + + amontsqr_p256(zz,xd) + montsqr_p256(ww,yd) + + montmul_p256(zzx1,zz,x1a) + montmul_p256(zzx2,zz,x2a) + + sub_p256(resx,ww,zzx1) + sub_p256(t1,zzx2,zzx1) + + montmul_p256(xd,xd,z_1) + + sub_p256(resx,resx,zzx2) + + sub_p256(t2,zzx1,resx) + + montmul_p256(t1,t1,y1a) + montmul_p256(resz,xd,z_2) + montmul_p256(t2,yd,t2) + + sub_p256(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) +// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + + orr x12, x0, x1 + orr x13, x2, x3 + orr x12, x12, x13 + cmp x12, xzr + cset x12, ne + + ldp x4, x5, [z_2] + ldp x6, x7, [z_2+16] + + orr x13, x4, x5 + orr x14, x6, x7 + orr x13, x13, x14 + cmp x13, xzr + cset x13, ne + + cmp x13, x12 + +// Multiplex the outputs accordingly, re-using the z's in registers + + ldp x8, x9, [resz] + csel x8, x0, x8, lo + csel x9, x1, x9, lo + csel x8, x4, x8, hi + csel x9, x5, x9, hi + ldp x10, x11, [resz+16] + csel x10, x2, x10, lo + csel x11, x3, x11, lo + csel x10, x6, x10, hi + csel x11, x7, x11, hi + + ldp x12, x13, [x_1] + ldp x0, x1, [resx] + csel x0, x12, x0, lo + csel x1, x13, x1, lo + ldp x12, x13, [x_2] + csel x0, x12, x0, hi + csel x1, x13, x1, hi + + ldp x12, x13, [x_1+16] + ldp x2, x3, [resx+16] + csel x2, x12, x2, lo + csel x3, x13, x3, lo + ldp x12, x13, [x_2+16] + csel x2, x12, x2, hi + csel x3, x13, x3, hi + + ldp x12, x13, [y_1] + ldp x4, x5, [resy] + csel x4, x12, x4, lo + csel x5, x13, x5, lo + ldp x12, x13, [y_2] + csel x4, x12, x4, hi + csel x5, x13, x5, hi + + ldp x12, x13, [y_1+16] + ldp x6, x7, [resy+16] + csel x6, x12, x6, lo + csel x7, x13, x7, lo + ldp x12, x13, [y_2+16] + csel x6, x12, x6, hi + csel x7, x13, x7, hi + +// Finally store back the multiplexed values + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [y_3] + stp x6, x7, [y_3+16] + stp x8, x9, [z_3] + stp x10, x11, [z_3+16] + +// Restore registers and return + + add sp, sp, NSPACE + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjdouble.S 
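
The field-operation schedule spelled out by the comments above (12 multiplications, 4 squarings, 7 subtractions, followed by the P1 = 0 / P2 = 0 multiplexing) is easier to follow in plain modular arithmetic. The sketch below mirrors that schedule with ordinary Python integers; it deliberately ignores the Montgomery encoding, and the function and variable names are illustrative rather than taken from the library.

    # Sketch of the p256_montjadd_alt schedule, in plain (non-Montgomery) arithmetic.
    P256 = 2**256 - 2**224 + 2**192 + 2**96 - 1

    def jacobian_add(p1, p2):
        x1, y1, z1 = p1
        x2, y2, z2 = p2
        z1sq = z1 * z1 % P256            # amontsqr_p256(z1sq, z_1)
        z2sq = z2 * z2 % P256            # amontsqr_p256(z2sq, z_2)
        y1a = z2 * y1 % P256             # montmul_p256(y1a, z_2, y_1)
        y2a = z1 * y2 % P256             # montmul_p256(y2a, z_1, y_2)
        x2a = z1sq * x2 % P256           # montmul_p256(x2a, z1sq, x_2)
        x1a = z2sq * x1 % P256           # montmul_p256(x1a, z2sq, x_1)
        y2a = z1sq * y2a % P256          # montmul_p256(y2a, z1sq, y2a)
        y1a = z2sq * y1a % P256          # montmul_p256(y1a, z2sq, y1a)
        xd = (x2a - x1a) % P256          # sub_p256(xd, x2a, x1a)
        yd = (y2a - y1a) % P256          # sub_p256(yd, y2a, y1a)
        zz = xd * xd % P256              # amontsqr_p256(zz, xd)
        ww = yd * yd % P256              # montsqr_p256(ww, yd)
        zzx1 = zz * x1a % P256           # montmul_p256(zzx1, zz, x1a)
        zzx2 = zz * x2a % P256           # montmul_p256(zzx2, zz, x2a)
        resx = (ww - zzx1) % P256        # sub_p256(resx, ww, zzx1)
        t1 = (zzx2 - zzx1) % P256        # sub_p256(t1, zzx2, zzx1)
        xd = xd * z1 % P256              # montmul_p256(xd, xd, z_1)
        resx = (resx - zzx2) % P256      # sub_p256(resx, resx, zzx2)
        t2 = (zzx1 - resx) % P256        # sub_p256(t2, zzx1, resx)
        t1 = t1 * y1a % P256             # montmul_p256(t1, t1, y1a)
        resz = xd * z2 % P256            # montmul_p256(resz, xd, z_2)
        t2 = yd * t2 % P256              # montmul_p256(t2, yd, t2)
        resy = (t2 - t1) % P256          # sub_p256(resy, t2, t1)
        # Multiplex against the degenerate inputs, as the csel block does with
        # the "lo"/"hi" conditions derived from (P2 != 0) - (P1 != 0).
        if z1 == 0:
            return p2
        if z2 == 0:
            return p1
        return (resx, resy, resz)

In the assembly itself every coordinate stays in the Montgomery domain (x' = 2^256 * x mod p_256), and montmul_p256 folds the 2^-256 factor back in, so multiplying two encoded values yields the encoding of their product and no conversions are needed between steps of the schedule.
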
b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjdouble.S new file mode 100644 index 00000000000..6988a9439e3 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjdouble.S @@ -0,0 +1,1550 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjdouble +// (uint64_t p3[static 12],uint64_t p1[static 12]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + +// This is functionally equivalent to p256_montjdouble in unopt/p256_montjdouble.S. +// This is the result of doing the following sequence of optimizations: +// 1. Function inlining +// 2. Eliminating redundant load/store instructions +// 3. Folding (add addr, const) + load/store +// Function inlining is done manually. The second and third optimizations are +// done by a script. + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjdouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjdouble) + .text + .balign 4 + +#define NUMSIZE 32 +#define NSPACE #(NUMSIZE*6) + +S2N_BN_SYMBOL(p256_montjdouble): + + sub sp, sp, NSPACE+80 + stp x19, x20, [sp, NSPACE] + stp x21, x22, [sp, NSPACE+16] + stp x23, x24, [sp, NSPACE+32] + stp x25, x26, [sp, NSPACE+48] + stp x27, xzr, [sp, NSPACE+64] + + mov x19, x0 + mov x20, x1 + mov x0, sp + ldr q19, [x20, #64] + ldp x9, x13, [x20, #64] + ldr q23, [x20, #80] + ldr q0, [x20, #64] + ldp x1, x10, [x20, #80] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, 
x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x21, x3, x16, cs + csel x22, x8, x14, cs + csel x23, x11, x12, cs + csel x24, x5, x2, cs + stp x22, x23, [x0, #16] + stp x21, x24, [x0] + ldr q19, [x20, #32] + ldp x9, x13, [x20, #32] + ldr q23, [x20, #48] + ldr q0, [x20, #32] + ldp x1, x10, [x20, #48] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, 
x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x16, x3, x16, cs + csel x14, x8, x14, cs + csel x12, x11, x12, cs + csel x2, x5, x2, cs + stp x14, x12, [sp, #48] + stp x16, x2, [sp, #32] + ldp x5, x6, [x20, #0] + subs x5, x5, x21 + sbcs x6, x6, x24 + ldp x7, x8, [x20, #16] + sbcs x7, x7, x22 + sbcs x8, x8, x23 + csetm x3, cc + adds x10, x5, x3 + and x4, x3, #0xffffffff + adcs x25, x6, x4 + adcs x26, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x27, x8, x4 + stp x10, x25, [sp, #96] + stp x26, x27, [sp, #112] + ldp x5, x6, [x20] + adds x5, x5, x21 + adcs x6, x6, x24 + ldp x7, x8, [x20, #16] + adcs x7, x7, x22 + adcs x8, x8, x23 + csetm x3, cs + subs x9, x5, x3 + and x1, x3, #0xffffffff + sbcs x5, x6, x1 + sbcs x7, x7, xzr + and x2, x3, #0xffffffff00000001 + sbc x8, x8, x2 + stp x9, x5, [sp, #64] + stp x7, x8, [sp, #80] + ldr q20, [sp, #96] + ldr q0, [sp, #64] + rev64 v16.4s, v20.4s + subs x4, x9, x5 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x5, x25 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x7, x9 + ldr q20, [sp, #112] + sbcs x5, x8, x5 + ngc x17, xzr + subs x8, x7, x8 + uaddlp v27.2d, v16.4s + umulh x4, x9, x10 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x25, x10 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #80] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x10, x26 + sbcs x9, x25, x27 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x27, x26 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x21, x3, x13 + adcs x22, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x23, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x24, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs 
x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x21 + adcs x15, x16, x22 + eor x5, x17, x4 + adcs x9, x1, x23 + eor x1, x10, x5 + adcs x16, x2, x24 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x21, x11, x13 + and x1, x1, x13 + adcs x22, x4, x1 + and x1, x12, x13 + stp x21, x22, [sp, #96] + adcs x23, x7, xzr + adc x24, x17, x1 + stp x23, x24, [sp, #112] + ldp x4, x5, [x20, #32] + ldp x8, x9, [x20, #64] + adds x4, x4, x8 + adcs x5, x5, x9 + ldp x6, x7, [x20, #48] + ldp x10, x11, [x20, #80] + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x3, xzr, xzr + adds x8, x4, #0x1 + mov x9, #0xffffffff + sbcs x9, x5, x9 + sbcs x10, x6, xzr + mov x11, #0xffffffff00000001 + sbcs x11, x7, x11 + sbcs x3, x3, xzr + csel x4, x4, x8, cc + csel x5, x5, x9, cc + csel x6, x6, x10, cc + csel x7, x7, x11, cc + stp x4, x5, [sp, #64] + stp x6, x7, [sp, #80] + ldr q20, [sp, #32] + ldp x7, x17, [x20, #0] + ldr q0, [x20, #0] + ldp x6, x10, [sp, #32] + ldp x11, x15, [x20, #16] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [sp, #48] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [x20, #16] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #48] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x20, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, 
x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x20 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x20, x11, x13 + and x1, x1, x13 + adcs x25, x4, x1 + and x1, x12, x13 + stp x20, x25, [sp, #128] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #144] + ldr q19, [sp, #96] + ldr q23, [sp, #112] + ldr q0, [sp, #96] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x21, x22 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x21, x22 + umulh x15, x21, x23 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x21, x22 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x24, x23 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x22, x24 + adcs x11, x11, x8 + lsr 
x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x23, x24 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x23, x24 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x24, x24 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x24, x24 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x23, x23 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x23, x23 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x21, x3, x16, cs + csel x22, x8, x14, cs + csel x23, x11, x12, cs + csel x24, x5, x2, cs + ldr q19, [sp, #64] + ldp x9, x13, [sp, #64] + ldr q23, [sp, #80] + ldr q0, [sp, #64] + ldp x1, x10, [sp, #80] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, 
x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x13, x3, x16, cs + csel x14, x8, x14, cs + csel x15, x11, x12, cs + csel x26, x5, x2, cs + mov x1, #0x9 + mov x2, #0xffffffffffffffff + subs x9, x2, x21 + mov x2, #0xffffffff + sbcs x10, x2, x24 + ngcs x11, x22 + mov x2, #0xffffffff00000001 + sbc x12, x2, x23 + mul x3, x1, x9 + mul x4, x1, x10 + mul x5, x1, x11 + mul x6, x1, x12 + umulh x9, x1, x9 + umulh x10, x1, x10 + umulh x11, x1, x11 + umulh x7, x1, x12 + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, xzr + mov x1, #0xc + mul x8, x20, x1 + umulh x9, x20, x1 + adds x3, x3, x8 + mul x8, x25, x1 + umulh x10, x25, x1 + adcs x4, x4, x8 + ldp x11, x12, [sp, #144] + mul x8, x11, x1 + umulh x11, x11, x1 + adcs x5, x5, x8 + mul x8, x12, x1 + umulh x12, x12, x1 + adcs x6, x6, x8 + adc x7, x7, xzr + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, x12 + add x8, x7, #0x1 + lsl x10, x8, #32 + adds x6, x6, x10 + adc x7, x7, xzr + neg x9, x8 + sub x10, x10, #0x1 + subs x3, x3, x9 + sbcs x4, x4, x10 + sbcs x5, x5, xzr + sbcs x6, x6, x8 + sbc x8, x7, x8 + adds x20, x3, x8 + and x9, x8, #0xffffffff + adcs x21, x4, x9 + adcs x22, x5, xzr + neg x10, x9 + adc x23, x6, x10 + stp x20, x21, [sp, #160] + stp x22, x23, [sp, #176] + mov x2, sp + ldp x4, x3, [x2] + subs x5, x13, x4 + sbcs x6, x26, x3 + ldp x4, x3, [x2, #16] + sbcs x7, x14, x4 + sbcs x8, x15, x3 + csetm x3, cc + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [sp, #64] + stp x7, x8, [sp, #80] + mov x0, sp + ldr q19, [sp, #32] + ldp x9, x13, [sp, #32] + ldr q23, [sp, #48] + ldr q0, [sp, #32] + ldp x1, x10, [sp, #48] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, 
v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x24, x3, x16, cs + csel x25, x8, x14, cs + csel x26, x11, x12, cs + csel x27, x5, x2, cs + stp x25, x26, [x0, #16] + stp x24, x27, [x0] + ldr q20, [sp, #96] + ldr q0, [sp, #160] + ldp x6, x10, [sp, #96] + rev64 v16.4s, v20.4s + subs x4, x20, x21 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x21, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x22, x20 + ldr q20, [sp, #112] + sbcs x5, x23, x21 + ngc x17, xzr + subs x8, x22, x23 + uaddlp v27.2d, v16.4s + umulh x4, x20, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #176] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #112] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x20, x3, x13 + adcs x21, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x22, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x23, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, 
v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x20 + adcs x15, x16, x21 + eor x5, x17, x4 + adcs x9, x1, x22 + eor x1, x10, x5 + adcs x16, x2, x23 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x14, x11, x13 + and x1, x1, x13 + adcs x15, x4, x1 + and x1, x12, x13 + stp x14, x15, [sp, #96] + adcs x13, x7, xzr + adc x20, x17, x1 + stp x13, x20, [sp, #112] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #32] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #48] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [x19, #64] + stp x7, x8, [x19, #80] + ldp x1, x2, [sp, #128] + lsl x0, x1, #2 + ldp x6, x7, [sp, #160] + subs x0, x0, x6 + extr x1, x2, x1, #62 + sbcs x1, x1, x7 + ldp x3, x4, [sp, #144] + extr x2, x3, x2, #62 + ldp x6, x7, [sp, #176] + sbcs x2, x2, x6 + extr x3, x4, x3, #62 + sbcs x3, x3, x7 + lsr x4, x4, #62 + sbc x4, x4, xzr + add x5, x4, #0x1 + lsl x8, x5, #32 + negs x6, x8 + ngcs x7, xzr + sbc x8, x8, x5 + adds x0, x0, x5 + adcs x1, x1, x6 + adcs x2, x2, x7 + adcs x3, x3, x8 + csetm x5, cc + adds x0, x0, x5 + and x6, x5, #0xffffffff + adcs x1, x1, x6 + adcs x2, x2, xzr + neg x7, x6 + adc x3, x3, x7 + stp x0, x1, [x19] + stp x2, x3, [x19, #16] + mov x2, #0xffffffffffffffff + subs x9, x2, x24 + mov x2, #0xffffffff + sbcs x10, x2, x27 + ngcs x11, x25 + mov x2, #0xffffffff00000001 + sbc x12, x2, x26 + lsl x3, x9, #3 + extr x4, x10, x9, #61 + extr x5, x11, x10, #61 + extr x6, x12, x11, #61 + lsr x7, x12, #61 + mov x1, #0x3 + mul x8, x14, x1 + umulh x9, x14, x1 + adds x3, x3, x8 + mul x8, x15, x1 + umulh x10, x15, x1 + adcs x4, x4, x8 + mul x8, x13, x1 + umulh x11, x13, x1 + adcs x5, x5, x8 + mul x8, x20, x1 + umulh x12, x20, x1 + adcs x6, x6, x8 + adc x7, x7, xzr + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc 
x7, x7, x12 + add x8, x7, #0x1 + lsl x10, x8, #32 + adds x6, x6, x10 + adc x7, x7, xzr + neg x9, x8 + sub x10, x10, #0x1 + subs x3, x3, x9 + sbcs x4, x4, x10 + sbcs x5, x5, xzr + sbcs x6, x6, x8 + sbc x8, x7, x8 + adds x3, x3, x8 + and x9, x8, #0xffffffff + adcs x4, x4, x9 + adcs x5, x5, xzr + neg x10, x9 + adc x6, x6, x10 + stp x3, x4, [x19, #32] + stp x5, x6, [x19, #48] + + ldp x27, xzr, [sp, NSPACE+64] + ldp x25, x26, [sp, NSPACE+48] + ldp x23, x24, [sp, NSPACE+32] + ldp x21, x22, [sp, NSPACE+16] + ldp x19, x20, [sp, NSPACE] + add sp, sp, NSPACE+80 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjdouble_alt.S new file mode 100644 index 00000000000..d6079cccf4a --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjdouble_alt.S @@ -0,0 +1,582 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjdouble_alt +// (uint64_t p3[static 12],uint64_t p1[static 12]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjdouble_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x15 +#define input_x x16 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z2 sp, #(NUMSIZE*0) +#define y4 sp, #(NUMSIZE*0) + +#define y2 sp, #(NUMSIZE*1) + +#define t1 sp, #(NUMSIZE*2) + +#define t2 sp, #(NUMSIZE*3) +#define x2p sp, #(NUMSIZE*3) +#define dx2 sp, #(NUMSIZE*3) + +#define xy2 sp, #(NUMSIZE*4) + +#define x4p sp, #(NUMSIZE*5) +#define d sp, #(NUMSIZE*5) + +#define NSPACE #(NUMSIZE*6) + +// Corresponds exactly to bignum_montmul_p256_alt except registers + +#define montmul_p256(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x0, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x1, x3, x10 __LF \ + adcs x0, x0, x11 __LF \ + adc x1, x1, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x1, x1, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh 
x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x0, x0, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x1, x1, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x0, x0, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x1, x1, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x0, x0, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + mov x10, #0xffffffff00000001 __LF \ + adds x13, x13, x12, lsl #32 __LF \ + lsr x11, x12, #32 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x12, x10 __LF \ + umulh x12, x12, x10 __LF \ + adcs x0, x0, x11 __LF \ + adc x12, x12, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x1, x1, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + adds x14, x14, x13, lsl #32 __LF \ + lsr x11, x13, #32 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x13, x10 __LF \ + umulh x13, x13, x10 __LF \ + adcs x12, x12, x11 __LF \ + adc x13, x13, xzr __LF \ + adds x0, x0, x14, lsl #32 __LF \ + lsr x11, x14, #32 __LF \ + adcs x12, x12, x11 __LF \ + mul x11, x14, x10 __LF \ + umulh x14, x14, x10 __LF \ + adcs x13, x13, x11 __LF \ + adc x14, x14, xzr __LF \ + adds x12, x12, x0, lsl #32 __LF \ + lsr x11, x0, #32 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x0, x10 __LF \ + umulh x0, x0, x10 __LF \ + adcs x14, x14, x11 __LF \ + adc x0, x0, xzr __LF \ + adds x12, x12, x1 __LF \ + adcs x13, x13, x3 __LF \ + adcs x14, x14, x4 __LF \ + adcs x0, x0, x5 __LF \ + cset x8, cs __LF \ + mov x11, #0xffffffff __LF \ + adds x1, x12, #0x1 __LF \ + sbcs x3, x13, x11 __LF \ + sbcs x4, x14, xzr __LF \ + sbcs x5, x0, x10 __LF \ + sbcs xzr, x8, xzr __LF \ + csel x12, x12, x1, cc __LF \ + csel x13, x13, x3, cc __LF \ + csel x14, x14, x4, cc __LF \ + csel x0, x0, x5, cc __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x0, [P0+16] + +// Corresponds exactly to bignum_montsqr_p256_alt + +#define montsqr_p256(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x6, x2, x4 __LF \ + umulh x7, x2, x4 __LF \ + adds x10, x10, x6 __LF \ + adcs x11, x11, x7 __LF \ + mul x6, x3, x4 __LF \ + umulh x7, x3, x4 __LF \ + adc x7, x7, xzr __LF \ + adds x11, x11, x6 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x7 __LF \ + mul x6, x3, x5 __LF \ + umulh x7, x3, x5 __LF \ + adc x7, x7, xzr __LF \ + adds x12, x12, x6 __LF \ + adcs x13, x13, x7 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x7, hs __LF \ + umulh x6, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x6 __LF \ + mul x6, x3, x3 __LF \ + adcs x10, x10, x6 __LF \ + umulh x6, x3, x3 __LF \ + adcs x11, x11, x6 __LF \ + mul x6, x4, x4 __LF \ + adcs x12, x12, x6 __LF \ + umulh x6, x4, x4 __LF \ + adcs x13, x13, x6 __LF \ + mul x6, x5, x5 __LF \ + adcs x14, 
x14, x6 __LF \ + umulh x6, x5, x5 __LF \ + adc x7, x7, x6 __LF \ + mov x5, #-4294967295 __LF \ + adds x9, x9, x8, lsl #32 __LF \ + lsr x3, x8, #32 __LF \ + adcs x10, x10, x3 __LF \ + mul x2, x8, x5 __LF \ + umulh x8, x8, x5 __LF \ + adcs x11, x11, x2 __LF \ + adc x8, x8, xzr __LF \ + adds x10, x10, x9, lsl #32 __LF \ + lsr x3, x9, #32 __LF \ + adcs x11, x11, x3 __LF \ + mul x2, x9, x5 __LF \ + umulh x9, x9, x5 __LF \ + adcs x8, x8, x2 __LF \ + adc x9, x9, xzr __LF \ + adds x11, x11, x10, lsl #32 __LF \ + lsr x3, x10, #32 __LF \ + adcs x8, x8, x3 __LF \ + mul x2, x10, x5 __LF \ + umulh x10, x10, x5 __LF \ + adcs x9, x9, x2 __LF \ + adc x10, x10, xzr __LF \ + adds x8, x8, x11, lsl #32 __LF \ + lsr x3, x11, #32 __LF \ + adcs x9, x9, x3 __LF \ + mul x2, x11, x5 __LF \ + umulh x11, x11, x5 __LF \ + adcs x10, x10, x2 __LF \ + adc x11, x11, xzr __LF \ + adds x8, x8, x12 __LF \ + adcs x9, x9, x13 __LF \ + adcs x10, x10, x14 __LF \ + adcs x11, x11, x7 __LF \ + cset x2, hs __LF \ + mov x3, #4294967295 __LF \ + adds x12, x8, #1 __LF \ + sbcs x13, x9, x3 __LF \ + sbcs x14, x10, xzr __LF \ + sbcs x7, x11, x5 __LF \ + sbcs xzr, x2, xzr __LF \ + csel x8, x8, x12, lo __LF \ + csel x9, x9, x13, lo __LF \ + csel x10, x10, x14, lo __LF \ + csel x11, x11, x7, lo __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Corresponds exactly to bignum_sub_p256 + +#define sub_p256(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + csetm x3, lo __LF \ + adds x5, x5, x3 __LF \ + and x4, x3, #0xffffffff __LF \ + adcs x6, x6, x4 __LF \ + adcs x7, x7, xzr __LF \ + and x4, x3, #0xffffffff00000001 __LF \ + adc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Corresponds exactly to bignum_add_p256 + +#define add_p256(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + adds x5, x5, x4 __LF \ + adcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + adcs x7, x7, x4 __LF \ + adcs x8, x8, x3 __LF \ + adc x3, xzr, xzr __LF \ + cmn x5, #1 __LF \ + mov x4, #4294967295 __LF \ + sbcs xzr, x6, x4 __LF \ + sbcs xzr, x7, xzr __LF \ + mov x4, #-4294967295 __LF \ + sbcs xzr, x8, x4 __LF \ + adcs x3, x3, xzr __LF \ + csetm x3, ne __LF \ + subs x5, x5, x3 __LF \ + and x4, x3, #0xffffffff __LF \ + sbcs x6, x6, x4 __LF \ + sbcs x7, x7, xzr __LF \ + and x4, x3, #0xffffffff00000001 __LF \ + sbc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// A weak version of add that only guarantees sum in 4 digits + +#define weakadd_p256(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + adds x5, x5, x4 __LF \ + adcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + adcs x7, x7, x4 __LF \ + adcs x8, x8, x3 __LF \ + csetm x3, cs __LF \ + subs x5, x5, x3 __LF \ + and x1, x3, #4294967295 __LF \ + sbcs x6, x6, x1 __LF \ + sbcs x7, x7, xzr __LF \ + and x2, x3, #-4294967295 __LF \ + sbc x8, x8, x2 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// P0 = C * P1 - D * P2 computed as D * (p_256 - P2) + C * P1 +// Quotient estimation is done just as q = h + 1 as in bignum_triple_p256 +// This also applies to the other functions following. 
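+//
+// Illustrative note (added for exposition; not part of the upstream source):
+// with C = 12 and D = 9 as instantiated below, P1 and P2 are reduced residues,
+// so the accumulated value D * (p_256 - P2) + C * P1 is < 21 * p_256 and its
+// fifth 64-bit digit h is at most 20. Because p_256 = 2^256 - 2^224 + 2^192 +
+// 2^96 - 1 is within a factor of roughly (1 + 2^-32) of 2^256, the true
+// quotient is h or h + 1, so q = h + 1 overshoots by at most one, and a single
+// masked addition of p_256 at the end of the macro restores a value in
+// [0, p_256) in constant time.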
+ +#define cmsub_p256(P0,C,P1,D,P2) \ + mov x1, D __LF \ + mov x2, #-1 __LF \ + ldp x9, x10, [P2] __LF \ + subs x9, x2, x9 __LF \ + mov x2, #4294967295 __LF \ + sbcs x10, x2, x10 __LF \ + ldp x11, x12, [P2+16] __LF \ + sbcs x11, xzr, x11 __LF \ + mov x2, #-4294967295 __LF \ + sbc x12, x2, x12 __LF \ + mul x3, x1, x9 __LF \ + mul x4, x1, x10 __LF \ + mul x5, x1, x11 __LF \ + mul x6, x1, x12 __LF \ + umulh x9, x1, x9 __LF \ + umulh x10, x1, x10 __LF \ + umulh x11, x1, x11 __LF \ + umulh x7, x1, x12 __LF \ + adds x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + adcs x6, x6, x11 __LF \ + adc x7, x7, xzr __LF \ + mov x1, C __LF \ + ldp x9, x10, [P1] __LF \ + mul x8, x9, x1 __LF \ + umulh x9, x9, x1 __LF \ + adds x3, x3, x8 __LF \ + mul x8, x10, x1 __LF \ + umulh x10, x10, x1 __LF \ + adcs x4, x4, x8 __LF \ + ldp x11, x12, [P1+16] __LF \ + mul x8, x11, x1 __LF \ + umulh x11, x11, x1 __LF \ + adcs x5, x5, x8 __LF \ + mul x8, x12, x1 __LF \ + umulh x12, x12, x1 __LF \ + adcs x6, x6, x8 __LF \ + adc x7, x7, xzr __LF \ + adds x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + adcs x6, x6, x11 __LF \ + adc x7, x7, x12 __LF \ + add x8, x7, #1 __LF \ + lsl x10, x8, #32 __LF \ + adds x6, x6, x10 __LF \ + adc x7, x7, xzr __LF \ + neg x9, x8 __LF \ + sub x10, x10, #1 __LF \ + subs x3, x3, x9 __LF \ + sbcs x4, x4, x10 __LF \ + sbcs x5, x5, xzr __LF \ + sbcs x6, x6, x8 __LF \ + sbc x8, x7, x8 __LF \ + adds x3, x3, x8 __LF \ + and x9, x8, #4294967295 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, xzr __LF \ + neg x10, x9 __LF \ + adc x6, x6, x10 __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +// P0 = 4 * P1 - P2, by direct subtraction of P2; the method +// in bignum_cmul_p256 etc. for quotient estimation still +// works when the value to be reduced is negative, as +// long as it is > -p_256, which is the case here. The +// actual accumulation of q * p_256 is done a bit differently +// so it works for the q = 0 case. 
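+//
+// Illustrative note (added for exposition; not part of the upstream source):
+// since 0 <= P1, P2 < p_256, the value 4 * P1 - P2 lies strictly between
+// -p_256 and 4 * p_256, which is the bound the remark above relies on. The
+// top digit of the five-digit two's-complement result is 0..3 when it is
+// non-negative and all ones when it is negative, so q = h + 1 ranges over
+// 0..4, with q = 0 exactly in the negative case. Building the digits used to
+// subtract q * p_256 via the subtract-from-zero sequence below keeps them all
+// zero when q = 0 (the "sub #1" form used in cmsub_p256 assumes q >= 1), and
+// the final csetm-driven masked addition of p_256 lands the result in
+// [0, p_256).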
+ +#define cmsub41_p256(P0,P1,P2) \ + ldp x1, x2, [P1] __LF \ + lsl x0, x1, #2 __LF \ + ldp x6, x7, [P2] __LF \ + subs x0, x0, x6 __LF \ + extr x1, x2, x1, #62 __LF \ + sbcs x1, x1, x7 __LF \ + ldp x3, x4, [P1+16] __LF \ + extr x2, x3, x2, #62 __LF \ + ldp x6, x7, [P2+16] __LF \ + sbcs x2, x2, x6 __LF \ + extr x3, x4, x3, #62 __LF \ + sbcs x3, x3, x7 __LF \ + lsr x4, x4, #62 __LF \ + sbc x4, x4, xzr __LF \ + add x5, x4, #1 __LF \ + lsl x8, x5, #32 __LF \ + subs x6, xzr, x8 __LF \ + sbcs x7, xzr, xzr __LF \ + sbc x8, x8, x5 __LF \ + adds x0, x0, x5 __LF \ + adcs x1, x1, x6 __LF \ + adcs x2, x2, x7 __LF \ + adcs x3, x3, x8 __LF \ + csetm x5, cc __LF \ + adds x0, x0, x5 __LF \ + and x6, x5, #4294967295 __LF \ + adcs x1, x1, x6 __LF \ + adcs x2, x2, xzr __LF \ + neg x7, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +// P0 = 3 * P1 - 8 * P2, computed as (p_256 - P2) << 3 + 3 * P1 + +#define cmsub38_p256(P0,P1,P2) \ + mov x1, 8 __LF \ + mov x2, #-1 __LF \ + ldp x9, x10, [P2] __LF \ + subs x9, x2, x9 __LF \ + mov x2, #4294967295 __LF \ + sbcs x10, x2, x10 __LF \ + ldp x11, x12, [P2+16] __LF \ + sbcs x11, xzr, x11 __LF \ + mov x2, #-4294967295 __LF \ + sbc x12, x2, x12 __LF \ + lsl x3, x9, #3 __LF \ + extr x4, x10, x9, #61 __LF \ + extr x5, x11, x10, #61 __LF \ + extr x6, x12, x11, #61 __LF \ + lsr x7, x12, #61 __LF \ + mov x1, 3 __LF \ + ldp x9, x10, [P1] __LF \ + mul x8, x9, x1 __LF \ + umulh x9, x9, x1 __LF \ + adds x3, x3, x8 __LF \ + mul x8, x10, x1 __LF \ + umulh x10, x10, x1 __LF \ + adcs x4, x4, x8 __LF \ + ldp x11, x12, [P1+16] __LF \ + mul x8, x11, x1 __LF \ + umulh x11, x11, x1 __LF \ + adcs x5, x5, x8 __LF \ + mul x8, x12, x1 __LF \ + umulh x12, x12, x1 __LF \ + adcs x6, x6, x8 __LF \ + adc x7, x7, xzr __LF \ + adds x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + adcs x6, x6, x11 __LF \ + adc x7, x7, x12 __LF \ + add x8, x7, #1 __LF \ + lsl x10, x8, #32 __LF \ + adds x6, x6, x10 __LF \ + adc x7, x7, xzr __LF \ + neg x9, x8 __LF \ + sub x10, x10, #1 __LF \ + subs x3, x3, x9 __LF \ + sbcs x4, x4, x10 __LF \ + sbcs x5, x5, xzr __LF \ + sbcs x6, x6, x8 __LF \ + sbc x8, x7, x8 __LF \ + adds x3, x3, x8 __LF \ + and x9, x8, #4294967295 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, xzr __LF \ + neg x10, x9 __LF \ + adc x6, x6, x10 __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +S2N_BN_SYMBOL(p256_montjdouble_alt): + +// Make room on stack for temporary variables + + sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + montsqr_p256(z2,z_1) + montsqr_p256(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + sub_p256(t2,x_1,z2) + weakadd_p256(t1,x_1,z2) + montmul_p256(x2p,t1,t2) + +// t1 = y + z +// xy2 = x * y^2 +// x4p = x2p^2 + + add_p256(t1,y_1,z_1) + montmul_p256(xy2,x_1,y2) + montsqr_p256(x4p,x2p) + +// t1 = (y + z)^2 + + montsqr_p256(t1,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_p256(d,12,xy2,9,x4p) + sub_p256(t1,t1,z2) + +// y4 = y^4 + + montsqr_p256(y4,y2) + +// dx2 = d * x2p + + montmul_p256(dx2,d,x2p) + +// z_3' = 2 * y * z + + sub_p256(z_3,t1,y2) + +// x' = 4 * xy2 - d + + cmsub41_p256(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_p256(y_3,dx2,y4) + +// Restore stack and return + + add sp, sp, NSPACE + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git 
a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjmixadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjmixadd.S new file mode 100644 index 00000000000..bd388d03e7a --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjmixadd.S @@ -0,0 +1,507 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjmixadd +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjmixadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjmixadd) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x17 +#define input_x x19 +#define input_y x20 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define NSPACE (NUMSIZE*6) + +// Corresponds to bignum_montmul_p256 but uses x0 in place of x17 + +#define montmul_p256(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2] __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x7 __LF \ + mul x13, x4, x8 __LF \ + umulh x12, x3, x7 __LF \ + adds x16, x11, x13 __LF \ + umulh x14, x4, x8 __LF \ + adcs x0, x12, x14 __LF \ + adcs x14, x14, xzr __LF \ + adds x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adcs x14, x14, xzr __LF \ + subs x15, x3, x4 __LF \ + cneg x15, x15, lo __LF \ + csetm x1, lo __LF \ + subs x0, x8, x7 __LF \ + cneg x0, x0, lo __LF \ + mul x16, x15, x0 __LF \ + umulh x0, x15, x0 __LF \ + cinv x1, x1, lo __LF \ + eor x16, x16, x1 __LF \ + eor x0, x0, x1 __LF \ + cmn x1, #1 __LF \ + adcs x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adc x14, x14, x1 __LF \ + lsl x0, x11, #32 __LF \ + subs x1, x11, x0 __LF \ + lsr x16, x11, #32 __LF \ + sbc x11, x11, x16 __LF \ + adds x12, x12, x0 __LF \ + adcs x13, x13, x16 __LF \ + adcs x14, x14, x1 __LF \ + adc x11, x11, xzr __LF \ + lsl x0, x12, #32 __LF \ + subs x1, x12, x0 __LF \ + lsr x16, x12, 
#32 __LF \ + sbc x12, x12, x16 __LF \ + adds x13, x13, x0 __LF \ + adcs x14, x14, x16 __LF \ + adcs x11, x11, x1 __LF \ + adc x12, x12, xzr __LF \ + stp x13, x14, [P0] __LF \ + stp x11, x12, [P0+16] __LF \ + mul x11, x5, x9 __LF \ + mul x13, x6, x10 __LF \ + umulh x12, x5, x9 __LF \ + adds x16, x11, x13 __LF \ + umulh x14, x6, x10 __LF \ + adcs x0, x12, x14 __LF \ + adcs x14, x14, xzr __LF \ + adds x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adcs x14, x14, xzr __LF \ + subs x15, x5, x6 __LF \ + cneg x15, x15, lo __LF \ + csetm x1, lo __LF \ + subs x0, x10, x9 __LF \ + cneg x0, x0, lo __LF \ + mul x16, x15, x0 __LF \ + umulh x0, x15, x0 __LF \ + cinv x1, x1, lo __LF \ + eor x16, x16, x1 __LF \ + eor x0, x0, x1 __LF \ + cmn x1, #1 __LF \ + adcs x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adc x14, x14, x1 __LF \ + subs x3, x5, x3 __LF \ + sbcs x4, x6, x4 __LF \ + ngc x5, xzr __LF \ + cmn x5, #1 __LF \ + eor x3, x3, x5 __LF \ + adcs x3, x3, xzr __LF \ + eor x4, x4, x5 __LF \ + adcs x4, x4, xzr __LF \ + subs x7, x7, x9 __LF \ + sbcs x8, x8, x10 __LF \ + ngc x9, xzr __LF \ + cmn x9, #1 __LF \ + eor x7, x7, x9 __LF \ + adcs x7, x7, xzr __LF \ + eor x8, x8, x9 __LF \ + adcs x8, x8, xzr __LF \ + eor x10, x5, x9 __LF \ + ldp x15, x1, [P0] __LF \ + adds x15, x11, x15 __LF \ + adcs x1, x12, x1 __LF \ + ldp x5, x9, [P0+16] __LF \ + adcs x5, x13, x5 __LF \ + adcs x9, x14, x9 __LF \ + adc x2, xzr, xzr __LF \ + mul x11, x3, x7 __LF \ + mul x13, x4, x8 __LF \ + umulh x12, x3, x7 __LF \ + adds x16, x11, x13 __LF \ + umulh x14, x4, x8 __LF \ + adcs x0, x12, x14 __LF \ + adcs x14, x14, xzr __LF \ + adds x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adcs x14, x14, xzr __LF \ + subs x3, x3, x4 __LF \ + cneg x3, x3, lo __LF \ + csetm x4, lo __LF \ + subs x0, x8, x7 __LF \ + cneg x0, x0, lo __LF \ + mul x16, x3, x0 __LF \ + umulh x0, x3, x0 __LF \ + cinv x4, x4, lo __LF \ + eor x16, x16, x4 __LF \ + eor x0, x0, x4 __LF \ + cmn x4, #1 __LF \ + adcs x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adc x14, x14, x4 __LF \ + cmn x10, #1 __LF \ + eor x11, x11, x10 __LF \ + adcs x11, x11, x15 __LF \ + eor x12, x12, x10 __LF \ + adcs x12, x12, x1 __LF \ + eor x13, x13, x10 __LF \ + adcs x13, x13, x5 __LF \ + eor x14, x14, x10 __LF \ + adcs x14, x14, x9 __LF \ + adcs x3, x2, x10 __LF \ + adcs x4, x10, xzr __LF \ + adc x10, x10, xzr __LF \ + adds x13, x13, x15 __LF \ + adcs x14, x14, x1 __LF \ + adcs x3, x3, x5 __LF \ + adcs x4, x4, x9 __LF \ + adc x10, x10, x2 __LF \ + lsl x0, x11, #32 __LF \ + subs x1, x11, x0 __LF \ + lsr x16, x11, #32 __LF \ + sbc x11, x11, x16 __LF \ + adds x12, x12, x0 __LF \ + adcs x13, x13, x16 __LF \ + adcs x14, x14, x1 __LF \ + adc x11, x11, xzr __LF \ + lsl x0, x12, #32 __LF \ + subs x1, x12, x0 __LF \ + lsr x16, x12, #32 __LF \ + sbc x12, x12, x16 __LF \ + adds x13, x13, x0 __LF \ + adcs x14, x14, x16 __LF \ + adcs x11, x11, x1 __LF \ + adc x12, x12, xzr __LF \ + adds x3, x3, x11 __LF \ + adcs x4, x4, x12 __LF \ + adc x10, x10, xzr __LF \ + add x2, x10, #1 __LF \ + lsl x16, x2, #32 __LF \ + adds x4, x4, x16 __LF \ + adc x10, x10, xzr __LF \ + neg x15, x2 __LF \ + sub x16, x16, #1 __LF \ + subs x13, x13, x15 __LF \ + sbcs x14, x14, x16 __LF \ + sbcs x3, x3, xzr __LF \ + sbcs x4, x4, x2 __LF \ + sbcs x7, x10, x2 __LF \ + adds x13, x13, x7 __LF \ + mov x10, #4294967295 __LF \ + and x10, x10, x7 __LF \ + adcs x14, x14, x10 __LF \ + adcs x3, x3, xzr __LF \ + mov x10, #-4294967295 __LF \ + and x10, x10, x7 __LF \ + adc x4, x4, x10 __LF \ + stp x13, x14, [P0] __LF \ + stp x3, x4, 
[P0+16] + +// Corresponds to bignum_montsqr_p256 but uses x0 in place of x17 + +#define montsqr_p256(P0,P1) \ + ldp x2, x3, [P1] __LF \ + ldp x4, x5, [P1+16] __LF \ + umull x15, w2, w2 __LF \ + lsr x11, x2, #32 __LF \ + umull x16, w11, w11 __LF \ + umull x11, w2, w11 __LF \ + adds x15, x15, x11, lsl #33 __LF \ + lsr x11, x11, #31 __LF \ + adc x16, x16, x11 __LF \ + umull x0, w3, w3 __LF \ + lsr x11, x3, #32 __LF \ + umull x1, w11, w11 __LF \ + umull x11, w3, w11 __LF \ + mul x12, x2, x3 __LF \ + umulh x13, x2, x3 __LF \ + adds x0, x0, x11, lsl #33 __LF \ + lsr x11, x11, #31 __LF \ + adc x1, x1, x11 __LF \ + adds x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adc x1, x1, xzr __LF \ + adds x16, x16, x12 __LF \ + adcs x0, x0, x13 __LF \ + adc x1, x1, xzr __LF \ + lsl x12, x15, #32 __LF \ + subs x13, x15, x12 __LF \ + lsr x11, x15, #32 __LF \ + sbc x15, x15, x11 __LF \ + adds x16, x16, x12 __LF \ + adcs x0, x0, x11 __LF \ + adcs x1, x1, x13 __LF \ + adc x15, x15, xzr __LF \ + lsl x12, x16, #32 __LF \ + subs x13, x16, x12 __LF \ + lsr x11, x16, #32 __LF \ + sbc x16, x16, x11 __LF \ + adds x0, x0, x12 __LF \ + adcs x1, x1, x11 __LF \ + adcs x15, x15, x13 __LF \ + adc x16, x16, xzr __LF \ + mul x6, x2, x4 __LF \ + mul x14, x3, x5 __LF \ + umulh x8, x2, x4 __LF \ + subs x10, x2, x3 __LF \ + cneg x10, x10, lo __LF \ + csetm x13, lo __LF \ + subs x12, x5, x4 __LF \ + cneg x12, x12, lo __LF \ + mul x11, x10, x12 __LF \ + umulh x12, x10, x12 __LF \ + cinv x13, x13, lo __LF \ + eor x11, x11, x13 __LF \ + eor x12, x12, x13 __LF \ + adds x7, x6, x8 __LF \ + adc x8, x8, xzr __LF \ + umulh x9, x3, x5 __LF \ + adds x7, x7, x14 __LF \ + adcs x8, x8, x9 __LF \ + adc x9, x9, xzr __LF \ + adds x8, x8, x14 __LF \ + adc x9, x9, xzr __LF \ + cmn x13, #1 __LF \ + adcs x7, x7, x11 __LF \ + adcs x8, x8, x12 __LF \ + adc x9, x9, x13 __LF \ + adds x6, x6, x6 __LF \ + adcs x7, x7, x7 __LF \ + adcs x8, x8, x8 __LF \ + adcs x9, x9, x9 __LF \ + adc x10, xzr, xzr __LF \ + adds x6, x6, x0 __LF \ + adcs x7, x7, x1 __LF \ + adcs x8, x8, x15 __LF \ + adcs x9, x9, x16 __LF \ + adc x10, x10, xzr __LF \ + lsl x12, x6, #32 __LF \ + subs x13, x6, x12 __LF \ + lsr x11, x6, #32 __LF \ + sbc x6, x6, x11 __LF \ + adds x7, x7, x12 __LF \ + adcs x8, x8, x11 __LF \ + adcs x9, x9, x13 __LF \ + adcs x10, x10, x6 __LF \ + adc x6, xzr, xzr __LF \ + lsl x12, x7, #32 __LF \ + subs x13, x7, x12 __LF \ + lsr x11, x7, #32 __LF \ + sbc x7, x7, x11 __LF \ + adds x8, x8, x12 __LF \ + adcs x9, x9, x11 __LF \ + adcs x10, x10, x13 __LF \ + adcs x6, x6, x7 __LF \ + adc x7, xzr, xzr __LF \ + mul x11, x4, x4 __LF \ + adds x8, x8, x11 __LF \ + mul x12, x5, x5 __LF \ + umulh x11, x4, x4 __LF \ + adcs x9, x9, x11 __LF \ + adcs x10, x10, x12 __LF \ + umulh x12, x5, x5 __LF \ + adcs x6, x6, x12 __LF \ + adc x7, x7, xzr __LF \ + mul x11, x4, x5 __LF \ + umulh x12, x4, x5 __LF \ + adds x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adc x13, xzr, xzr __LF \ + adds x9, x9, x11 __LF \ + adcs x10, x10, x12 __LF \ + adcs x6, x6, x13 __LF \ + adcs x7, x7, xzr __LF \ + mov x11, #4294967295 __LF \ + adds x5, x8, #1 __LF \ + sbcs x11, x9, x11 __LF \ + mov x13, #-4294967295 __LF \ + sbcs x12, x10, xzr __LF \ + sbcs x13, x6, x13 __LF \ + sbcs xzr, x7, xzr __LF \ + csel x8, x5, x8, hs __LF \ + csel x9, x11, x9, hs __LF \ + csel x10, x12, x10, hs __LF \ + csel x6, x13, x6, hs __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x6, [P0+16] + +// Corresponds exactly to bignum_sub_p256 + +#define sub_p256(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, 
x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + csetm x3, cc __LF \ + adds x5, x5, x3 __LF \ + mov x4, #0xffffffff __LF \ + and x4, x4, x3 __LF \ + adcs x6, x6, x4 __LF \ + adcs x7, x7, xzr __LF \ + mov x4, #0xffffffff00000001 __LF \ + and x4, x4, x3 __LF \ + adc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +S2N_BN_SYMBOL(p256_montjmixadd): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + montsqr_p256(zp2,z_1) + montmul_p256(y2a,z_1,y_2) + + montmul_p256(x2a,zp2,x_2) + montmul_p256(y2a,zp2,y2a) + + sub_p256(xd,x2a,x_1) + sub_p256(yd,y2a,y_1) + + montsqr_p256(zz,xd) + montsqr_p256(ww,yd) + + montmul_p256(zzx1,zz,x_1) + montmul_p256(zzx2,zz,x2a) + + sub_p256(resx,ww,zzx1) + sub_p256(t1,zzx2,zzx1) + + montmul_p256(resz,xd,z_1) + + sub_p256(resx,resx,zzx2) + + sub_p256(t2,zzx1,resx) + + montmul_p256(t1,t1,y_1) + montmul_p256(t2,yd,t2) + + sub_p256(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + orr x4, x0, x1 + orr x5, x2, x3 + orr x4, x4, x5 + cmp x4, xzr + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^256 - p_256), +// hence giving 0 + p2 = p2 for the final result. + + ldp x0, x1, [resx] + ldp x12, x13, [x_2] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x2, x3, [resx+16] + ldp x12, x13, [x_2+16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + + ldp x4, x5, [resy] + ldp x12, x13, [y_2] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x6, x7, [resy+16] + ldp x12, x13, [y_2+16] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + + ldp x8, x9, [resz] + mov x12, #0x0000000000000001 + mov x13, #0xffffffff00000000 + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x10, x11, [resz+16] + mov x12, #0xffffffffffffffff + mov x13, #0x00000000fffffffe + csel x10, x10, x12, ne + csel x11, x11, x13, ne + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [y_3] + stp x6, x7, [y_3+16] + stp x8, x9, [z_3] + stp x10, x11, [z_3+16] + +// Restore registers and return + + add sp, sp, NSPACE + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjmixadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjmixadd_alt.S new file mode 100644 index 00000000000..90f49bc3568 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjmixadd_alt.S @@ -0,0 +1,511 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjmixadd_alt +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjmixadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjmixadd_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x15 +#define input_x x16 +#define input_y x17 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define NSPACE (NUMSIZE*6) + +// Corresponds to bignum_montmul_p256_alt except registers + +#define montmul_p256(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x0, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x1, x3, x10 __LF \ + adcs x0, x0, x11 __LF \ + adc x1, x1, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x1, x1, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x0, x0, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x1, x1, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x0, x0, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x1, x1, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x0, x0, x11 __LF \ + mul x11, x6, x8 __LF 
\ + adcs x1, x1, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + mov x10, #0xffffffff00000001 __LF \ + adds x13, x13, x12, lsl #32 __LF \ + lsr x11, x12, #32 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x12, x10 __LF \ + umulh x12, x12, x10 __LF \ + adcs x0, x0, x11 __LF \ + adc x12, x12, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x1, x1, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + adds x14, x14, x13, lsl #32 __LF \ + lsr x11, x13, #32 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x13, x10 __LF \ + umulh x13, x13, x10 __LF \ + adcs x12, x12, x11 __LF \ + adc x13, x13, xzr __LF \ + adds x0, x0, x14, lsl #32 __LF \ + lsr x11, x14, #32 __LF \ + adcs x12, x12, x11 __LF \ + mul x11, x14, x10 __LF \ + umulh x14, x14, x10 __LF \ + adcs x13, x13, x11 __LF \ + adc x14, x14, xzr __LF \ + adds x12, x12, x0, lsl #32 __LF \ + lsr x11, x0, #32 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x0, x10 __LF \ + umulh x0, x0, x10 __LF \ + adcs x14, x14, x11 __LF \ + adc x0, x0, xzr __LF \ + adds x12, x12, x1 __LF \ + adcs x13, x13, x3 __LF \ + adcs x14, x14, x4 __LF \ + adcs x0, x0, x5 __LF \ + cset x8, cs __LF \ + mov x11, #0xffffffff __LF \ + adds x1, x12, #0x1 __LF \ + sbcs x3, x13, x11 __LF \ + sbcs x4, x14, xzr __LF \ + sbcs x5, x0, x10 __LF \ + sbcs xzr, x8, xzr __LF \ + csel x12, x12, x1, cc __LF \ + csel x13, x13, x3, cc __LF \ + csel x14, x14, x4, cc __LF \ + csel x0, x0, x5, cc __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x0, [P0+16] + +// Corresponds exactly to bignum_montsqr_p256_alt + +#define montsqr_p256(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x6, x2, x4 __LF \ + umulh x7, x2, x4 __LF \ + adds x10, x10, x6 __LF \ + adcs x11, x11, x7 __LF \ + mul x6, x3, x4 __LF \ + umulh x7, x3, x4 __LF \ + adc x7, x7, xzr __LF \ + adds x11, x11, x6 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x7 __LF \ + mul x6, x3, x5 __LF \ + umulh x7, x3, x5 __LF \ + adc x7, x7, xzr __LF \ + adds x12, x12, x6 __LF \ + adcs x13, x13, x7 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x7, cs __LF \ + umulh x6, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x6 __LF \ + mul x6, x3, x3 __LF \ + adcs x10, x10, x6 __LF \ + umulh x6, x3, x3 __LF \ + adcs x11, x11, x6 __LF \ + mul x6, x4, x4 __LF \ + adcs x12, x12, x6 __LF \ + umulh x6, x4, x4 __LF \ + adcs x13, x13, x6 __LF \ + mul x6, x5, x5 __LF \ + adcs x14, x14, x6 __LF \ + umulh x6, x5, x5 __LF \ + adc x7, x7, x6 __LF \ + adds x9, x9, x8, lsl #32 __LF \ + lsr x3, x8, #32 __LF \ + adcs x10, x10, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x8, x3 __LF \ + umulh x8, x8, x3 __LF \ + adcs x11, x11, x2 __LF \ + adc x8, x8, xzr __LF \ + adds x10, x10, x9, lsl #32 __LF \ + lsr x3, x9, #32 __LF \ + adcs x11, x11, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x9, x3 __LF \ + umulh x9, x9, x3 __LF \ + adcs x8, x8, x2 __LF \ + adc x9, x9, xzr __LF \ + adds x11, x11, x10, lsl #32 __LF \ + lsr x3, x10, #32 __LF \ + adcs x8, x8, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x10, x3 __LF \ + umulh x10, x10, x3 __LF \ + 
adcs x9, x9, x2 __LF \ + adc x10, x10, xzr __LF \ + adds x8, x8, x11, lsl #32 __LF \ + lsr x3, x11, #32 __LF \ + adcs x9, x9, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x11, x3 __LF \ + umulh x11, x11, x3 __LF \ + adcs x10, x10, x2 __LF \ + adc x11, x11, xzr __LF \ + adds x8, x8, x12 __LF \ + adcs x9, x9, x13 __LF \ + adcs x10, x10, x14 __LF \ + adcs x11, x11, x7 __LF \ + cset x2, cs __LF \ + mov x3, #0xffffffff __LF \ + mov x5, #0xffffffff00000001 __LF \ + adds x12, x8, #0x1 __LF \ + sbcs x13, x9, x3 __LF \ + sbcs x14, x10, xzr __LF \ + sbcs x7, x11, x5 __LF \ + sbcs xzr, x2, xzr __LF \ + csel x8, x8, x12, cc __LF \ + csel x9, x9, x13, cc __LF \ + csel x10, x10, x14, cc __LF \ + csel x11, x11, x7, cc __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Almost-Montgomery variant which we use when an input to other muls +// with the other argument fully reduced (which is always safe). + +#define amontsqr_p256(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x6, x2, x4 __LF \ + umulh x7, x2, x4 __LF \ + adds x10, x10, x6 __LF \ + adcs x11, x11, x7 __LF \ + mul x6, x3, x4 __LF \ + umulh x7, x3, x4 __LF \ + adc x7, x7, xzr __LF \ + adds x11, x11, x6 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x7 __LF \ + mul x6, x3, x5 __LF \ + umulh x7, x3, x5 __LF \ + adc x7, x7, xzr __LF \ + adds x12, x12, x6 __LF \ + adcs x13, x13, x7 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x7, cs __LF \ + umulh x6, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x6 __LF \ + mul x6, x3, x3 __LF \ + adcs x10, x10, x6 __LF \ + umulh x6, x3, x3 __LF \ + adcs x11, x11, x6 __LF \ + mul x6, x4, x4 __LF \ + adcs x12, x12, x6 __LF \ + umulh x6, x4, x4 __LF \ + adcs x13, x13, x6 __LF \ + mul x6, x5, x5 __LF \ + adcs x14, x14, x6 __LF \ + umulh x6, x5, x5 __LF \ + adc x7, x7, x6 __LF \ + adds x9, x9, x8, lsl #32 __LF \ + lsr x3, x8, #32 __LF \ + adcs x10, x10, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x8, x3 __LF \ + umulh x8, x8, x3 __LF \ + adcs x11, x11, x2 __LF \ + adc x8, x8, xzr __LF \ + adds x10, x10, x9, lsl #32 __LF \ + lsr x3, x9, #32 __LF \ + adcs x11, x11, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x9, x3 __LF \ + umulh x9, x9, x3 __LF \ + adcs x8, x8, x2 __LF \ + adc x9, x9, xzr __LF \ + adds x11, x11, x10, lsl #32 __LF \ + lsr x3, x10, #32 __LF \ + adcs x8, x8, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x10, x3 __LF \ + umulh x10, x10, x3 __LF \ + adcs x9, x9, x2 __LF \ + adc x10, x10, xzr __LF \ + adds x8, x8, x11, lsl #32 __LF \ + lsr x3, x11, #32 __LF \ + adcs x9, x9, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x11, x3 __LF \ + umulh x11, x11, x3 __LF \ + adcs x10, x10, x2 __LF \ + adc x11, x11, xzr __LF \ + adds x8, x8, x12 __LF \ + adcs x9, x9, x13 __LF \ + adcs x10, x10, x14 __LF \ + adcs x11, x11, x7 __LF \ + mov x2, #0xffffffffffffffff __LF \ + csel x2, xzr, x2, cc __LF \ + mov x3, #0xffffffff __LF \ + csel x3, xzr, x3, cc __LF \ + mov x5, #0xffffffff00000001 __LF \ + csel x5, xzr, x5, cc __LF \ + subs x8, x8, x2 __LF \ + sbcs x9, x9, x3 __LF \ + sbcs x10, x10, xzr __LF \ + sbc x11, x11, x5 __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Corresponds exactly to bignum_sub_p256 + +#define 
sub_p256(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + csetm x3, cc __LF \ + adds x5, x5, x3 __LF \ + mov x4, #0xffffffff __LF \ + and x4, x4, x3 __LF \ + adcs x6, x6, x4 __LF \ + adcs x7, x7, xzr __LF \ + mov x4, #0xffffffff00000001 __LF \ + and x4, x4, x3 __LF \ + adc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +S2N_BN_SYMBOL(p256_montjmixadd_alt): + +// Make room on stack for temporary variables +// Move the input arguments to stable places + + sub sp, sp, NSPACE + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + amontsqr_p256(zp2,z_1) + montmul_p256(y2a,z_1,y_2) + + montmul_p256(x2a,zp2,x_2) + montmul_p256(y2a,zp2,y2a) + + sub_p256(xd,x2a,x_1) + sub_p256(yd,y2a,y_1) + + amontsqr_p256(zz,xd) + montsqr_p256(ww,yd) + + montmul_p256(zzx1,zz,x_1) + montmul_p256(zzx2,zz,x2a) + + sub_p256(resx,ww,zzx1) + sub_p256(t1,zzx2,zzx1) + + montmul_p256(resz,xd,z_1) + + sub_p256(resx,resx,zzx2) + + sub_p256(t2,zzx1,resx) + + montmul_p256(t1,t1,y_1) + montmul_p256(t2,yd,t2) + + sub_p256(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + orr x4, x0, x1 + orr x5, x2, x3 + orr x4, x4, x5 + cmp x4, xzr + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^256 - p_256), +// hence giving 0 + p2 = p2 for the final result. + + ldp x0, x1, [resx] + ldp x12, x13, [x_2] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x2, x3, [resx+16] + ldp x12, x13, [x_2+16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + + ldp x4, x5, [resy] + ldp x12, x13, [y_2] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x6, x7, [resy+16] + ldp x12, x13, [y_2+16] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + + ldp x8, x9, [resz] + mov x12, #0x0000000000000001 + mov x13, #0xffffffff00000000 + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x10, x11, [resz+16] + mov x12, #0xffffffffffffffff + mov x13, #0x00000000fffffffe + csel x10, x10, x12, ne + csel x11, x11, x13, ne + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [y_3] + stp x6, x7, [y_3+16] + stp x8, x9, [z_3] + stp x10, x11, [z_3+16] + +// Restore stack and return + + add sp, sp, NSPACE + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p256/p256_montjscalarmul.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjscalarmul.S similarity index 99% rename from third_party/s2n-bignum/arm/p256/p256_montjscalarmul.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjscalarmul.S index 246421ff37d..51f05ac2732 100644 --- a/third_party/s2n-bignum/arm/p256/p256_montjscalarmul.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjscalarmul.S @@ -60,33 +60,33 @@ // which doesn't accept repetitions, assembler macros etc. 
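
For readability, the 8-multiply / 3-square / 7-subtract sequence in the main code of p256_montjmixadd_alt above is transcribed below into plain C over a small toy prime field. This is an illustration only, not part of the patch: fmul/fsqr/fsqr/fsub and the 31-bit prime TOY_P stand in for the 256-bit Montgomery-domain macros, and the z_1 = 0 multiplexing that follows the sequence is omitted.

#include <stdint.h>
#include <stdio.h>

// Toy-field transcription of the field-operation sequence in
// p256_montjmixadd_alt (same dataflow; a 31-bit prime and plain modular
// arithmetic stand in for the 256-bit Montgomery-domain macros).
#define TOY_P 2147483647ULL   // stand-in prime, NOT p_256

static uint64_t fmul(uint64_t a, uint64_t b) { return (a * b) % TOY_P; }
static uint64_t fsqr(uint64_t a)             { return (a * a) % TOY_P; }
static uint64_t fsub(uint64_t a, uint64_t b) { return (a + TOY_P - b) % TOY_P; }

// (x1,y1,z1) Jacobian + (x2,y2) affine -> (x3,y3,z3) Jacobian;
// the z_1 = 0 multiplexing done after the sequence is omitted here.
void mixadd(uint64_t x1, uint64_t y1, uint64_t z1,
            uint64_t x2, uint64_t y2,
            uint64_t *x3, uint64_t *y3, uint64_t *z3) {
  uint64_t zp2  = fsqr(z1);          // amontsqr_p256(zp2,z_1)
  uint64_t y2a  = fmul(z1, y2);      // montmul_p256(y2a,z_1,y_2)
  uint64_t x2a  = fmul(zp2, x2);     // montmul_p256(x2a,zp2,x_2)
  y2a           = fmul(zp2, y2a);    // montmul_p256(y2a,zp2,y2a)
  uint64_t xd   = fsub(x2a, x1);     // sub_p256(xd,x2a,x_1)
  uint64_t yd   = fsub(y2a, y1);     // sub_p256(yd,y2a,y_1)
  uint64_t zz   = fsqr(xd);          // amontsqr_p256(zz,xd)
  uint64_t ww   = fsqr(yd);          // montsqr_p256(ww,yd)
  uint64_t zzx1 = fmul(zz, x1);      // montmul_p256(zzx1,zz,x_1)
  uint64_t zzx2 = fmul(zz, x2a);     // montmul_p256(zzx2,zz,x2a)
  uint64_t resx = fsub(ww, zzx1);    // sub_p256(resx,ww,zzx1)
  uint64_t t1   = fsub(zzx2, zzx1);  // sub_p256(t1,zzx2,zzx1)
  uint64_t resz = fmul(xd, z1);      // montmul_p256(resz,xd,z_1)
  resx          = fsub(resx, zzx2);  // sub_p256(resx,resx,zzx2)
  uint64_t t2   = fsub(zzx1, resx);  // sub_p256(t2,zzx1,resx)
  t1            = fmul(t1, y1);      // montmul_p256(t1,t1,y_1)
  t2            = fmul(yd, t2);      // montmul_p256(t2,yd,t2)
  *x3 = resx;
  *y3 = fsub(t2, t1);                // sub_p256(resy,t2,t1)
  *z3 = resz;
}

int main(void) {
  uint64_t x3, y3, z3;
  mixadd(5, 7, 11, 13, 17, &x3, &y3, &z3);   // arbitrary small field elements
  printf("%llu %llu %llu\n", (unsigned long long)x3,
         (unsigned long long)y3, (unsigned long long)z3);
  return 0;
}
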
#define selectblock(I) \ - cmp x14, #(1*I); \ - ldp x12, x13, [x15]; \ - csel x0, x12, x0, eq; \ - csel x1, x13, x1, eq; \ - ldp x12, x13, [x15, #16]; \ - csel x2, x12, x2, eq; \ - csel x3, x13, x3, eq; \ - ldp x12, x13, [x15, #32]; \ - csel x4, x12, x4, eq; \ - csel x5, x13, x5, eq; \ - ldp x12, x13, [x15, #48]; \ - csel x6, x12, x6, eq; \ - csel x7, x13, x7, eq; \ - ldp x12, x13, [x15, #64]; \ - csel x8, x12, x8, eq; \ - csel x9, x13, x9, eq; \ - ldp x12, x13, [x15, #80]; \ - csel x10, x12, x10, eq; \ - csel x11, x13, x11, eq; \ + cmp x14, #(1*I) __LF \ + ldp x12, x13, [x15] __LF \ + csel x0, x12, x0, eq __LF \ + csel x1, x13, x1, eq __LF \ + ldp x12, x13, [x15, #16] __LF \ + csel x2, x12, x2, eq __LF \ + csel x3, x13, x3, eq __LF \ + ldp x12, x13, [x15, #32] __LF \ + csel x4, x12, x4, eq __LF \ + csel x5, x13, x5, eq __LF \ + ldp x12, x13, [x15, #48] __LF \ + csel x6, x12, x6, eq __LF \ + csel x7, x13, x7, eq __LF \ + ldp x12, x13, [x15, #64] __LF \ + csel x8, x12, x8, eq __LF \ + csel x9, x13, x9, eq __LF \ + ldp x12, x13, [x15, #80] __LF \ + csel x10, x12, x10, eq __LF \ + csel x11, x13, x11, eq __LF \ add x15, x15, #96 // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 S2N_BN_SYMBOL(p256_montjscalarmul): diff --git a/third_party/s2n-bignum/arm/p256/p256_montjscalarmul_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjscalarmul_alt.S similarity index 98% rename from third_party/s2n-bignum/arm/p256/p256_montjscalarmul_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjscalarmul_alt.S index 8ac5806a725..74ed964f9ae 100644 --- a/third_party/s2n-bignum/arm/p256/p256_montjscalarmul_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjscalarmul_alt.S @@ -60,33 +60,33 @@ // which doesn't accept repetitions, assembler macros etc. 
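
The selectblock(I) macro changed in these two hunks does a constant-time table lookup: it compares the secret index against I and uses csel so that every table row is read regardless of which one is kept. A rough C analogue of that masking idea follows; ct_select_row is a hypothetical illustration of the technique, not an s2n-bignum function, and it indexes rows from 0 rather than 1.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

// Branch-free row selection in the spirit of selectblock(I): every row of
// the table is read, and a mask derived from comparing the (secret) index
// decides which row survives in the output.
static void ct_select_row(uint64_t *out, const uint64_t *table,
                          size_t rows, size_t width, uint64_t idx) {
  for (size_t j = 0; j < width; j++) out[j] = 0;
  for (size_t i = 0; i < rows; i++) {
    uint64_t diff = (uint64_t)i ^ idx;
    // mask = all-ones when i == idx, else all-zeros
    uint64_t mask = ((diff | (0 - diff)) >> 63) - 1;
    for (size_t j = 0; j < width; j++)
      out[j] |= table[i * width + j] & mask;
  }
}

int main(void) {
  // 8 rows of 12 words, like the 96-byte Jacobian entries walked by selectblock
  uint64_t table[8 * 12];
  for (int i = 0; i < 8 * 12; i++) table[i] = (uint64_t)i;
  uint64_t row[12];
  ct_select_row(row, table, 8, 12, 5);
  printf("%llu\n", (unsigned long long)row[0]);   // prints 60 (= 5 * 12)
  return 0;
}
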
#define selectblock(I) \ - cmp x14, #(1*I); \ - ldp x12, x13, [x15]; \ - csel x0, x12, x0, eq; \ - csel x1, x13, x1, eq; \ - ldp x12, x13, [x15, #16]; \ - csel x2, x12, x2, eq; \ - csel x3, x13, x3, eq; \ - ldp x12, x13, [x15, #32]; \ - csel x4, x12, x4, eq; \ - csel x5, x13, x5, eq; \ - ldp x12, x13, [x15, #48]; \ - csel x6, x12, x6, eq; \ - csel x7, x13, x7, eq; \ - ldp x12, x13, [x15, #64]; \ - csel x8, x12, x8, eq; \ - csel x9, x13, x9, eq; \ - ldp x12, x13, [x15, #80]; \ - csel x10, x12, x10, eq; \ - csel x11, x13, x11, eq; \ + cmp x14, #(1*I) __LF \ + ldp x12, x13, [x15] __LF \ + csel x0, x12, x0, eq __LF \ + csel x1, x13, x1, eq __LF \ + ldp x12, x13, [x15, #16] __LF \ + csel x2, x12, x2, eq __LF \ + csel x3, x13, x3, eq __LF \ + ldp x12, x13, [x15, #32] __LF \ + csel x4, x12, x4, eq __LF \ + csel x5, x13, x5, eq __LF \ + ldp x12, x13, [x15, #48] __LF \ + csel x6, x12, x6, eq __LF \ + csel x7, x13, x7, eq __LF \ + ldp x12, x13, [x15, #64] __LF \ + csel x8, x12, x8, eq __LF \ + csel x9, x13, x9, eq __LF \ + ldp x12, x13, [x15, #80] __LF \ + csel x10, x12, x10, eq __LF \ + csel x11, x13, x11, eq __LF \ add x15, x15, #96 // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 S2N_BN_SYMBOL(p256_montjscalarmul_alt): diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_scalarmul.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_scalarmul.S new file mode 100644 index 00000000000..317008a461d --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_scalarmul.S @@ -0,0 +1,8575 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Scalar multiplication for P-256 +// Input scalar[4], point[8]; output res[8] +// +// extern void p256_scalarmul +// (uint64_t res[static 8], +// uint64_t scalar[static 4], +// uint64_t point[static 8]); +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve P-256, returns the point (X,Y) = n * P. The input and output +// are affine points, and in the case of the point at infinity as +// the result, (0,0) is returned. +// +// Standard ARM ABI: X0 = res, X1 = scalar, X2 = point +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_scalarmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_scalarmul) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Safe copies of inputs (res lasts the whole code, point not so long) +// and additional values in variables, with some aliasing + +#define res x19 +#define sgn x20 +#define j x20 +#define point x21 + +// Intermediate variables on the stack. The last z2, z3 values can +// safely be overlaid on the table, which is no longer needed at the end. 
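
For orientation, the header comment of the new p256_scalarmul.S above fixes the C-level contract; a minimal caller might look like the sketch below. This is a hypothetical wrapper, not part of the patch, and it assumes the assembled routine is linked in and that each coordinate is four little-endian 64-bit limbs, as in the rest of these routines.

#include <stdint.h>

// Prototype exactly as given in the file header above.
extern void p256_scalarmul(uint64_t res[static 8],
                           uint64_t scalar[static 4],
                           uint64_t point[static 8]);

// Hypothetical wrapper: copies the inputs into writable buffers and calls
// the assembly routine.
void scalarmul_example(const uint64_t n[4], const uint64_t P_affine[8],
                       uint64_t out_affine[8]) {
  uint64_t scalar[4], point[8];
  for (int i = 0; i < 4; i++) scalar[i] = n[i];
  for (int i = 0; i < 8; i++) point[i] = P_affine[i];
  // out_affine receives X then Y of n * P; the point at infinity comes
  // back as (0, 0), per the header comment.
  p256_scalarmul(out_affine, scalar, point);
}
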
+ +#define scalarb sp, #(0*NUMSIZE) +#define acc sp, #(1*NUMSIZE) +#define tabent sp, #(4*NUMSIZE) + +#define tab sp, #(7*NUMSIZE) + +#define z2 sp, #(7*NUMSIZE) +#define z3 sp, #(8*NUMSIZE) + +#define NSPACE #(31*NUMSIZE) + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(p256_scalarmul): + + stp x19, x20, [sp, #-16]! + stp x21, x30, [sp, #-16]! + sub sp, sp, NSPACE + +// Preserve the "res" and "point" input arguments. We load and process the +// scalar immediately so we don't bother preserving that input argument. +// Also, "point" is only needed early on and so its register gets re-used. + + mov res, x0 + mov point, x2 + +// Load the digits of group order n_256 = [x12;x13;x14;x15] + + movbig(x12, #0xf3b9, #0xcac2, #0xfc63, #0x2551) + movbig(x13, #0xbce6, #0xfaad, #0xa717, #0x9e84) + mov x14, #0xffffffffffffffff + mov x15, #0xffffffff00000000 + +// First, reduce the input scalar mod n_256, i.e. conditionally subtract n_256 + + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + + subs x6, x2, x12 + sbcs x7, x3, x13 + sbcs x8, x4, x14 + sbcs x9, x5, x15 + + csel x2, x2, x6, cc + csel x3, x3, x7, cc + csel x4, x4, x8, cc + csel x5, x5, x9, cc + +// Now if the top bit of the reduced scalar is set, negate it mod n_256, +// i.e. do n |-> n_256 - n. Remember the sign as "sgn" so we can +// correspondingly negate the point below. + + subs x6, x12, x2 + sbcs x7, x13, x3 + sbcs x8, x14, x4 + sbc x9, x15, x5 + + tst x5, #0x8000000000000000 + csel x2, x2, x6, eq + csel x3, x3, x7, eq + csel x4, x4, x8, eq + csel x5, x5, x9, eq + cset sgn, ne + +// In either case then add the recoding constant 0x08888...888 to allow +// signed digits. + + mov x6, 0x8888888888888888 + adds x2, x2, x6 + adcs x3, x3, x6 + bic x7, x6, #0xF000000000000000 + adcs x4, x4, x6 + adc x5, x5, x7 + + stp x2, x3, [scalarb] + stp x4, x5, [scalarb+16] + +// Set the tab[0] table entry to Montgomery-Jacobian point = 1 * P +// The z coordinate is just the Montgomery form of the constant 1. 
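
Since p_256 < 2^256 < 2 * p_256, that Montgomery form of 1 is 2^256 mod p_256 = 2^256 - p_256, i.e. the 256-bit two's-complement negation of p_256. The short standalone C check below (not part of the patch) reproduces the four words stored into the z coordinate just after this point.

#include <stdint.h>
#include <stdio.h>

// Standalone check: the Montgomery form of 1 for P-256 is 2^256 mod p_256
// = 2^256 - p_256, i.e. the 256-bit two's-complement negation of p_256.
int main(void) {
  // p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1, little-endian 64-bit limbs
  uint64_t p[4] = {0xffffffffffffffffULL, 0x00000000ffffffffULL,
                   0x0000000000000000ULL, 0xffffffff00000001ULL};
  uint64_t r[4];
  unsigned carry = 1;              // two's complement: ~p + 1 across limbs
  for (int i = 0; i < 4; i++) {
    uint64_t t = ~p[i] + carry;
    carry = (carry && t == 0);     // carry out only if ~p[i] + 1 wrapped to 0
    r[i] = t;
  }
  // Prints 0000000000000001, ffffffff00000000, ffffffffffffffff,
  // 00000000fffffffe, matching the words stored as the z coordinate above.
  for (int i = 0; i < 4; i++) printf("%016llx\n", (unsigned long long)r[i]);
  return 0;
}
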
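
Separately, the recoding constant 0x0888...888 added to the reduced scalar above is what lets the main loop further down treat every 4-bit window except the top one as a signed digit (window - 8) in [-8, 7], handled by a lookup of |digit| * P and an optional negation of y, while the top window is used unrecoded. The reduced-width C check below (16-bit scalars with the top bit clear, not part of the patch) verifies that identity.

#include <stdint.h>
#include <stdio.h>

// Reduced-width illustration of the signed-digit recoding: add 8 to every
// 4-bit window except the top one, then read each low window as (w - 8).
int main(void) {
  for (uint32_t n = 0; n < (1u << 15); n++) {   // top bit clear, as in the code
    uint32_t b = n + 0x0888;                    // recoding bias: +8 per low window
    int32_t acc = (int32_t)((b >> 12) & 0xf);   // top window, unrecoded
    for (int i = 2; i >= 0; i--) {
      int32_t d = (int32_t)((b >> (4 * i)) & 0xf) - 8;  // signed digit in [-8,7]
      acc = acc * 16 + d;                       // "double 4 times then add digit"
    }
    if ((uint32_t)acc != n) { printf("mismatch at %u\n", n); return 1; }
  }
  printf("recoding identity holds for all 15-bit scalars\n");
  return 0;
}
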
+ + add x0, tab + mov x1, point + bl p256_scalarmul_local_tomont_p256 + + add x1, point, #32 + add x0, tab+32 + bl p256_scalarmul_local_tomont_p256 + + mov x0, #0x0000000000000001 + mov x1, #0xffffffff00000000 + stp x0, x1, [tab+64] + mov x2, #0xffffffffffffffff + mov x3, #0x00000000fffffffe + stp x2, x3, [tab+80] + +// If the top bit of the scalar was set, negate (y coordinate of) the point + + ldp x4, x5, [tab+32] + ldp x6, x7, [tab+48] + + mov x0, 0xffffffffffffffff + subs x0, x0, x4 + mov x1, 0x00000000ffffffff + sbcs x1, x1, x5 + mov x3, 0xffffffff00000001 + sbcs x2, xzr, x6 + sbc x3, x3, x7 + + cmp sgn, xzr + csel x4, x0, x4, ne + csel x5, x1, x5, ne + csel x6, x2, x6, ne + csel x7, x3, x7, ne + + stp x4, x5, [tab+32] + stp x6, x7, [tab+48] + +// Compute and record tab[1] = 2 * p, ..., tab[7] = 8 * P + + add x0, tab+96*1 + add x1, tab + bl p256_scalarmul_local_p256_montjdouble + + add x0, tab+96*2 + add x1, tab+96*1 + add x2, tab + bl p256_scalarmul_local_p256_montjmixadd + + add x0, tab+96*3 + add x1, tab+96*1 + bl p256_scalarmul_local_p256_montjdouble + + add x0, tab+96*4 + add x1, tab+96*3 + add x2, tab + bl p256_scalarmul_local_p256_montjmixadd + + add x0, tab+96*5 + add x1, tab+96*2 + bl p256_scalarmul_local_p256_montjdouble + + add x0, tab+96*6 + add x1, tab+96*5 + add x2, tab + bl p256_scalarmul_local_p256_montjmixadd + + add x0, tab+96*7 + add x1, tab+96*3 + bl p256_scalarmul_local_p256_montjdouble + +// Initialize the accumulator as a table entry for top 4 bits (unrecoded) + + ldr x14, [scalarb+24] + lsr x14, x14, #60 + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + add x15, tab + + .set i, 1 +.rep 8 + cmp x14, #i + ldp x12, x13, [x15] + csel x0, x12, x0, eq + csel x1, x13, x1, eq + ldp x12, x13, [x15, #16] + csel x2, x12, x2, eq + csel x3, x13, x3, eq + ldp x12, x13, [x15, #32] + csel x4, x12, x4, eq + csel x5, x13, x5, eq + ldp x12, x13, [x15, #48] + csel x6, x12, x6, eq + csel x7, x13, x7, eq + ldp x12, x13, [x15, #64] + csel x8, x12, x8, eq + csel x9, x13, x9, eq + ldp x12, x13, [x15, #80] + csel x10, x12, x10, eq + csel x11, x13, x11, eq + add x15, x15, #96 + .set i, (i+1) +.endr + stp x0, x1, [acc] + stp x2, x3, [acc+16] + stp x4, x5, [acc+32] + stp x6, x7, [acc+48] + stp x8, x9, [acc+64] + stp x10, x11, [acc+80] + + mov j, #252 + +// Main loop over size-4 bitfields: double 4 times then add signed digit + +p256_scalarmul_loop: + sub j, j, #4 + + add x0, acc + add x1, acc + bl p256_scalarmul_local_p256_montjdouble + + add x0, acc + add x1, acc + bl p256_scalarmul_local_p256_montjdouble + + add x0, acc + add x1, acc + bl p256_scalarmul_local_p256_montjdouble + + add x0, acc + add x1, acc + bl p256_scalarmul_local_p256_montjdouble + + lsr x2, j, #6 + ldr x14, [sp, x2, lsl #3] // Exploits scalarb = sp exactly + lsr x14, x14, j + and x14, x14, #15 + + subs x14, x14, #8 + cset x16, lo // x16 = sign of digit (1 = negative) + cneg x14, x14, lo // x14 = absolute value of digit + +// Conditionally select the table entry tab[i-1] = i * P in constant time + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + add x15, tab + .set i, 1 +.rep 8 + cmp x14, #i + ldp x12, x13, [x15] + csel x0, x12, x0, eq + csel x1, x13, x1, eq + ldp x12, x13, [x15, #16] + csel x2, x12, x2, eq + csel x3, x13, x3, eq + ldp x12, x13, [x15, #32] + csel x4, x12, x4, 
eq + csel x5, x13, x5, eq + ldp x12, x13, [x15, #48] + csel x6, x12, x6, eq + csel x7, x13, x7, eq + ldp x12, x13, [x15, #64] + csel x8, x12, x8, eq + csel x9, x13, x9, eq + ldp x12, x13, [x15, #80] + csel x10, x12, x10, eq + csel x11, x13, x11, eq + add x15, x15, #96 + .set i, (i+1) +.endr + +// Store it to "tabent" with the y coordinate optionally negated + + stp x0, x1, [tabent] + stp x2, x3, [tabent+16] + + mov x0, 0xffffffffffffffff + subs x0, x0, x4 + mov x1, 0x00000000ffffffff + sbcs x1, x1, x5 + mov x3, 0xffffffff00000001 + sbcs x2, xzr, x6 + sbc x3, x3, x7 + + cmp x16, xzr + csel x4, x0, x4, ne + csel x5, x1, x5, ne + csel x6, x2, x6, ne + csel x7, x3, x7, ne + + stp x4, x5, [tabent+32] + stp x6, x7, [tabent+48] + stp x8, x9, [tabent+64] + stp x10, x11, [tabent+80] + + add x0, acc + add x1, acc + add x2, tabent + bl p256_scalarmul_local_p256_montjadd + + cbnz j, p256_scalarmul_loop + +// That's the end of the main loop, and we just need to translate +// back from the Jacobian representation to affine. First of all, +// let z2 = 1/z^2 and z3 = 1/z^3, both without Montgomery form + + add x0, z2 + add x1, acc+64 + bl p256_scalarmul_local_montsqr_p256 + + add x0, z3 + add x2, z2 + add x1, acc+64 + bl p256_scalarmul_local_montmul_p256 + + add x0, z2 + add x1, z3 + bl p256_scalarmul_local_demont_p256 + + add x0, z3 + add x1, z2 + bl p256_scalarmul_local_inv_p256 + + add x0, z2 + add x2, z3 + add x1, acc+64 + bl p256_scalarmul_local_montmul_p256 + +// Convert back from Jacobian (X,Y,Z) |-> (X/Z^2, Y/Z^3) + + add x1, acc + add x2, z2 + mov x0, res + bl p256_scalarmul_local_montmul_p256 + + add x0, res, #32 + add x1, acc+32 + add x2, z3 + bl p256_scalarmul_local_montmul_p256 + +// Restore stack and registers and return + + add sp, sp, NSPACE + ldp x21, x30, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +// Local copies of subroutines, complete clones at the moment + +p256_scalarmul_local_demont_p256: + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + lsl x7, x2, #32 + subs x8, x2, x7 + lsr x6, x2, #32 + sbc x2, x2, x6 + adds x3, x3, x7 + adcs x4, x4, x6 + adcs x5, x5, x8 + adc x2, x2, xzr + lsl x7, x3, #32 + subs x8, x3, x7 + lsr x6, x3, #32 + sbc x3, x3, x6 + adds x4, x4, x7 + adcs x5, x5, x6 + adcs x2, x2, x8 + adc x3, x3, xzr + lsl x7, x4, #32 + subs x8, x4, x7 + lsr x6, x4, #32 + sbc x4, x4, x6 + adds x5, x5, x7 + adcs x2, x2, x6 + adcs x3, x3, x8 + adc x4, x4, xzr + lsl x7, x5, #32 + subs x8, x5, x7 + lsr x6, x5, #32 + sbc x5, x5, x6 + adds x2, x2, x7 + adcs x3, x3, x6 + adcs x4, x4, x8 + adc x5, x5, xzr + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + ret + +p256_scalarmul_local_inv_p256: + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! 
+ sub sp, sp, #0xa0 + mov x20, x0 + mov x10, #0xffffffffffffffff + mov x11, #0xffffffff + mov x13, #0xffffffff00000001 + stp x10, x11, [sp] + stp xzr, x13, [sp, #16] + str xzr, [sp, #32] + ldp x2, x3, [x1] + subs x10, x2, x10 + sbcs x11, x3, x11 + ldp x4, x5, [x1, #16] + sbcs x12, x4, xzr + sbcs x13, x5, x13 + csel x2, x2, x10, cc + csel x3, x3, x11, cc + csel x4, x4, x12, cc + csel x5, x5, x13, cc + stp x2, x3, [sp, #48] + stp x4, x5, [sp, #64] + str xzr, [sp, #80] + stp xzr, xzr, [sp, #96] + stp xzr, xzr, [sp, #112] + mov x10, #0x4000000000000 + stp x10, xzr, [sp, #128] + stp xzr, xzr, [sp, #144] + mov x21, #0xa + mov x22, #0x1 + b p256_scalarmul_inv_midloop +p256_scalarmul_inv_loop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #48] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #64] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #56] + ldr x7, [sp, #24] + eor x1, x7, x14 + ldr x23, [sp, #32] + eor x3, x23, x14 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #72] + eor x1, x8, x15 + ldr x24, [sp, #80] + eor x0, x24, x15 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + asr x3, x3, #59 + str x3, [sp, #32] + eor x1, x7, x16 + eor x5, x23, x16 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + eor x0, x24, x17 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #64] + extr x2, x5, x2, #59 + str x2, [sp, #72] + asr x5, x5, #59 + str x5, [sp, #80] + ldr x7, [sp, #96] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #128] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #96] + adc x2, x2, x1 + eor x1, x7, x16 + 
mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #128] + adc x3, x3, x1 + ldr x7, [sp, #104] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #136] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #104] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #136] + adc x4, x4, x1 + ldr x7, [sp, #112] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #144] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #112] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #144] + adc x2, x2, x1 + ldr x7, [sp, #120] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #152] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + ldp x0, x1, [sp, #96] + ldr x6, [sp, #112] + mov x14, #0xe000000000000000 + adds x0, x0, x14 + sbcs x1, x1, xzr + mov x11, #0x1fffffff + adcs x6, x6, x11 + mov x10, #0x2000000000000000 + adcs x5, x5, x10 + mov x14, #0x1fffffffe0000000 + adc x3, x3, x14 + lsl x11, x0, #32 + subs x14, x0, x11 + lsr x10, x0, #32 + sbc x0, x0, x10 + adds x1, x1, x11 + adcs x6, x6, x10 + adcs x5, x5, x14 + adcs x3, x3, x0 + mov x14, #0xffffffffffffffff + mov x11, #0xffffffff + mov x10, #0xffffffff00000001 + csel x14, x14, xzr, cs + csel x11, x11, xzr, cs + csel x10, x10, xzr, cs + subs x1, x1, x14 + sbcs x6, x6, x11 + sbcs x5, x5, xzr + sbc x3, x3, x10 + stp x1, x6, [sp, #96] + stp x5, x3, [sp, #112] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + ldp x0, x1, [sp, #128] + ldr x3, [sp, #144] + mov x14, #0xe000000000000000 + adds x0, x0, x14 + sbcs x1, x1, xzr + mov x11, #0x1fffffff + adcs x3, x3, x11 + mov x10, #0x2000000000000000 + adcs x2, x2, x10 + mov x14, #0x1fffffffe0000000 + adc x5, x5, x14 + lsl x11, x0, #32 + subs x14, x0, x11 + lsr x10, x0, #32 + sbc x0, x0, x10 + adds x1, x1, x11 + adcs x3, x3, x10 + adcs x2, x2, x14 + adcs x5, x5, x0 + mov x14, #0xffffffffffffffff + mov x11, #0xffffffff + mov x10, #0xffffffff00000001 + csel x14, x14, xzr, cs + csel x11, x11, xzr, cs + csel x10, x10, xzr, cs + subs x1, x1, x14 + sbcs x3, x3, x11 + sbcs x2, x2, xzr + sbc x5, x5, x10 + stp x1, x3, [sp, #128] + stp x2, x5, [sp, #144] +p256_scalarmul_inv_midloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #48] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr 
x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, 
#0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, 
#1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge 
+ csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne p256_scalarmul_inv_loop + ldr x0, [sp] + ldr x1, [sp, #48] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #96] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #128] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #96] + adc x2, x2, x1 + ldr x7, [sp, #104] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #136] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #104] + adc x6, x6, x1 + ldr x7, [sp, #112] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #144] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #112] + adc x5, x5, x1 + ldr x7, [sp, #120] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #152] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + ldp x0, x1, [sp, #96] + ldr x2, [sp, #112] + mov x14, #0xe000000000000000 + adds x0, x0, x14 + sbcs x1, x1, xzr + mov x11, #0x1fffffff + adcs x2, x2, x11 + mov x10, #0x2000000000000000 + adcs x5, x5, x10 + mov x14, #0x1fffffffe0000000 + adc x3, x3, x14 + lsl x11, x0, #32 + subs x14, x0, x11 + lsr x10, x0, #32 + sbc x0, x0, x10 + adds x1, x1, x11 + adcs x2, x2, x10 + adcs x5, x5, x14 + adcs x3, x3, x0 + mov x14, #0xffffffffffffffff + mov x11, #0xffffffff + mov x10, #0xffffffff00000001 + csel x14, x14, xzr, cs + csel x11, x11, xzr, cs + csel x10, x10, xzr, cs + subs x1, x1, x14 + sbcs x2, x2, x11 + sbcs x5, x5, xzr + sbc x3, x3, x10 + mov x10, #0xffffffffffffffff + subs x10, x1, x10 + mov x11, #0xffffffff + sbcs x11, x2, x11 + mov x13, #0xffffffff00000001 + sbcs x12, x5, xzr + sbcs x13, x3, x13 + csel x10, x1, x10, cc + csel x11, x2, x11, cc + csel x12, x5, x12, cc + csel x13, x3, x13, cc + stp x10, x11, [x20] + stp x12, x13, [x20, #16] + add sp, sp, #0xa0 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +p256_scalarmul_local_montmul_p256: + ldr q20, [x2] + ldp x7, x17, [x1] + ldr q0, [x1] + ldp x6, x10, [x2] + ldp x11, x15, [x1, #16] 
+ rev64 v16.4S, v20.4S + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4S, v16.4S, v0.4S + umulh x12, x17, x10 + uzp1 v28.4S, v20.4S, v0.4S + subs x14, x11, x7 + ldr q20, [x2, #16] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2D, v16.4S + umulh x4, x7, x6 + uzp1 v21.4S, v0.4S, v0.4S + cneg x11, x8, cc + shl v17.2D, v27.2D, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2D, v21.2S, v28.2S + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [x1, #16] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2S, v20.2D + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4S, v20.4S, v20.4S + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2S, v28.2D + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x2, #16] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x9, x3, x13 + adcs x3, x8, x7 + umulh x8, x14, x11 + umull v21.2D, v0.2S, v1.2S + adcs x12, x10, x12 + umull v3.2D, v0.2S, v16.2S + adc x15, x15, xzr + rev64 v24.4S, v20.4S + stp x12, x15, [x0, #16] + movi v2.2D, #0x00000000ffffffff + mul x10, x14, x11 + mul v4.4S, v24.4S, v28.4S + subs x13, x14, x5 + uzp2 v19.4S, v28.4S, v28.4S + csetm x15, cc + usra v3.2D, v21.2D, #32 + mul x7, x5, x1 + umull v21.2D, v19.2S, v16.2S + cneg x13, x13, cc + uaddlp v5.2D, v4.4S + subs x11, x1, x11 + and v16.16B, v3.16B, v2.16B + umulh x5, x5, x1 + shl v24.2D, v5.2D, #32 + cneg x11, x11, cc + umlal v16.2D, v19.2S, v1.2S + cinv x12, x15, cc + umlal v24.2D, v0.2S, v1.2S + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + stp x9, x3, [x0] + usra v21.2D, v3.2D, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2D, v16.2D, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + ldp x15, x8, [x0] + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + ldp x9, x13, [x0, #16] + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x15 + adcs x15, x16, x8 + eor x5, x17, x4 + adcs x9, x1, x9 + eor x1, x10, x5 + adcs x16, x2, x13 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc 
x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [x0] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [x0, #16] + ret + +p256_scalarmul_local_montsqr_p256: + ldr q19, [x1] + ldp x9, x13, [x1] + ldr q23, [x1, #16] + ldr q0, [x1] + ldp x1, x10, [x1, #16] + uzp2 v29.4S, v19.4S, v19.4S + xtn v4.2S, v19.2D + umulh x8, x9, x13 + rev64 v20.4S, v23.4S + umull v16.2D, v19.2S, v19.2S + umull v1.2D, v29.2S, v4.2S + mul v20.4S, v20.4S, v0.4S + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2D, v19.4S, v19.4S + mov x4, v16.d[0] + uzp1 v17.4S, v23.4S, v0.4S + uaddlp v19.2D, v20.4S + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4S, v0.4S, v0.4S + shl v19.2D, v19.2D, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2D, v20.2S, v17.2S + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x16, x3, x16, cs + csel x14, x8, x14, cs + csel x12, x11, x12, cs + csel x2, x5, x2, cs + stp x14, x12, [x0, #16] + stp x16, x2, [x0] + ret + +p256_scalarmul_local_tomont_p256: + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x1, #0xffffffffffffffff + mov x7, #0xffffffff + mov x9, #0xffffffff00000001 + subs x1, x2, x1 + sbcs x7, x3, x7 + sbcs x8, x4, xzr + sbcs x9, x5, 
x9 + csel x2, x2, x1, cc + csel x3, x3, x7, cc + csel x4, x4, x8, cc + csel x5, x5, x9, cc + cmp xzr, xzr + extr x9, x5, x4, #32 + adcs xzr, x4, x9 + lsr x9, x5, #32 + adcs x9, x5, x9 + csetm x6, cs + orr x9, x9, x6 + lsl x7, x9, #32 + lsr x8, x9, #32 + adds x4, x4, x7 + adc x5, x5, x8 + negs x6, x9 + sbcs x7, x7, xzr + sbc x8, x8, xzr + negs x6, x6 + sbcs x2, x2, x7 + sbcs x3, x3, x8 + sbcs x4, x4, x9 + sbcs x5, x5, x9 + adds x6, x6, x5 + mov x7, #0xffffffff + and x7, x7, x5 + adcs x2, x2, x7 + adcs x3, x3, xzr + mov x7, #0xffffffff00000001 + and x7, x7, x5 + adc x4, x4, x7 + cmp xzr, xzr + extr x9, x4, x3, #32 + adcs xzr, x3, x9 + lsr x9, x4, #32 + adcs x9, x4, x9 + csetm x5, cs + orr x9, x9, x5 + lsl x7, x9, #32 + lsr x8, x9, #32 + adds x3, x3, x7 + adc x4, x4, x8 + negs x5, x9 + sbcs x7, x7, xzr + sbc x8, x8, xzr + negs x5, x5 + sbcs x6, x6, x7 + sbcs x2, x2, x8 + sbcs x3, x3, x9 + sbcs x4, x4, x9 + adds x5, x5, x4 + mov x7, #0xffffffff + and x7, x7, x4 + adcs x6, x6, x7 + adcs x2, x2, xzr + mov x7, #0xffffffff00000001 + and x7, x7, x4 + adc x3, x3, x7 + cmp xzr, xzr + extr x9, x3, x2, #32 + adcs xzr, x2, x9 + lsr x9, x3, #32 + adcs x9, x3, x9 + csetm x4, cs + orr x9, x9, x4 + lsl x7, x9, #32 + lsr x8, x9, #32 + adds x2, x2, x7 + adc x3, x3, x8 + negs x4, x9 + sbcs x7, x7, xzr + sbc x8, x8, xzr + negs x4, x4 + sbcs x5, x5, x7 + sbcs x6, x6, x8 + sbcs x2, x2, x9 + sbcs x3, x3, x9 + adds x4, x4, x3 + mov x7, #0xffffffff + and x7, x7, x3 + adcs x5, x5, x7 + adcs x6, x6, xzr + mov x7, #0xffffffff00000001 + and x7, x7, x3 + adc x2, x2, x7 + cmp xzr, xzr + extr x9, x2, x6, #32 + adcs xzr, x6, x9 + lsr x9, x2, #32 + adcs x9, x2, x9 + csetm x3, cs + orr x9, x9, x3 + lsl x7, x9, #32 + lsr x8, x9, #32 + adds x6, x6, x7 + adc x2, x2, x8 + negs x3, x9 + sbcs x7, x7, xzr + sbc x8, x8, xzr + negs x3, x3 + sbcs x4, x4, x7 + sbcs x5, x5, x8 + sbcs x6, x6, x9 + sbcs x2, x2, x9 + adds x3, x3, x2 + mov x7, #0xffffffff + and x7, x7, x2 + adcs x4, x4, x7 + adcs x5, x5, xzr + mov x7, #0xffffffff00000001 + and x7, x7, x2 + adc x6, x6, x7 + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ret + +p256_scalarmul_local_p256_montjadd: + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x30, [sp, #-16]! 
+ sub sp, sp, #0xe0 + mov x21, x0 + mov x22, x1 + mov x23, x2 + mov x0, sp + ldr q19, [x22, #64] + ldp x9, x13, [x22, #64] + ldr q23, [x22, #80] + ldr q0, [x22, #64] + ldp x1, x10, [x22, #80] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc // cc = lo, ul, last + cneg x6, x14, cc // cc = lo, ul, last + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc // cc = lo, ul, last + cinv x2, x5, cc // cc = lo, ul, last + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff // #4294967295 + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 // #-4294967295 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x19, x3, x16, cs // cs = hs, nlast + csel x14, x8, x14, cs // cs = hs, nlast + csel x12, x11, x12, cs // cs = hs, nlast + csel x20, x5, x2, cs // cs = hs, nlast + stp x14, x12, [x0, #16] + stp x19, x20, [x0] + ldr q19, [x23, #64] + ldp x9, x13, [x23, #64] + ldr q23, [x23, #80] + ldr q0, [x23, #64] + ldp x1, x10, [x23, #80] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, 
x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc // cc = lo, ul, last + cneg x6, x14, cc // cc = lo, ul, last + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc // cc = lo, ul, last + cinv x2, x5, cc // cc = lo, ul, last + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff // #4294967295 + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 // #-4294967295 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x16, x3, x16, cs // cs = hs, nlast + csel x14, x8, x14, cs // cs = hs, nlast + csel x12, x11, x12, cs // cs = hs, nlast + csel x2, x5, x2, cs // cs = hs, nlast + stp x14, x12, [sp, #176] + stp x16, x2, [sp, #160] + ldr q20, [x22, #32] + ldp x7, x17, [x23, #64] + ldr q0, [x23, #64] + ldp x6, x10, [x22, #32] + ldp x11, x15, [x23, #80] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [x22, #48] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [x23, #80] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 
+ xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x22, #48] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [sp, #192] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #208] + ldr q20, [x23, #32] + ldp x7, x17, [x22, #64] + ldr q0, [x22, #64] + ldp x6, x10, [x23, #32] + ldp x11, x15, [x22, #80] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // 
cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [x23, #48] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [x22, #80] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x23, #48] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, 
x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x24, x11, x13 + and x1, x1, x13 + adcs x25, x4, x1 + and x1, x12, x13 + stp x24, x25, [sp, #32] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #48] + mov x1, sp + ldr q20, [x23] + ldr q0, [x1] + ldp x6, x10, [x23] + ldp x11, x15, [x1, #16] + rev64 v16.4s, v20.4s + subs x4, x19, x20 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x20, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x19 + ldr q20, [x23, #16] + sbcs x5, x15, x20 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x19, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [x1, #16] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x23, #16] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn 
x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [sp, #64] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #80] + ldr q20, [x22] + ldp x7, x17, [sp, #160] + ldr q0, [sp, #160] + ldp x6, x10, [x22] + ldp x11, x15, [sp, #176] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [x22, #16] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [sp, #176] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x22, #16] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, 
x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [sp, #128] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #144] + mov x1, sp + ldr q20, [sp, #32] + ldp x7, x17, [x1] + ldr q0, [x1] + ldp x11, x15, [x1, #16] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x25 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [sp, #48] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x24 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x25, x24 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [x1, #16] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #48] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x24, x7 + sbcs x9, x25, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, 
x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x24, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x25, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x24 + eor x1, x10, x5 + adcs x16, x2, x25 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x19, x11, x13 + and x1, x1, x13 + adcs x20, x4, x1 + and x1, x12, x13 + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #48] + ldr q20, [sp, #192] + ldp x7, x17, [sp, #160] + ldr q0, [sp, #160] + ldp x6, x10, [sp, #192] + ldp x11, x15, [sp, #176] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [sp, #208] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last 
+ cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [sp, #176] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #208] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x9, x11, x13 + and x1, x1, x13 + adcs x10, x4, x1 + and x1, 
x12, x13 + stp x9, x10, [sp, #192] + adcs x11, x7, xzr + adc x12, x17, x1 + stp x11, x12, [sp, #208] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc // cc = lo, ul, last + adds x13, x5, x3 + and x4, x3, #0xffffffff + adcs x24, x6, x4 + adcs x25, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x26, x8, x4 + stp x13, x24, [sp, #160] + stp x25, x26, [sp, #176] + subs x5, x19, x9 + sbcs x6, x20, x10 + ldp x7, x8, [sp, #48] + sbcs x7, x7, x11 + sbcs x8, x8, x12 + csetm x3, cc // cc = lo, ul, last + adds x19, x5, x3 + and x4, x3, #0xffffffff + adcs x20, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x19, x20, [sp, #32] + stp x7, x8, [sp, #48] + ldr q19, [sp, #160] + ldr q23, [sp, #176] + ldr q0, [sp, #160] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x13, x24 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x13, x24 + umulh x15, x13, x25 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x13, x24 + mov x12, v1.d[0] + csetm x5, cc // cc = lo, ul, last + cneg x6, x14, cc // cc = lo, ul, last + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x26, x25 + mov x9, v1.d[1] + cneg x17, x2, cc // cc = lo, ul, last + cinv x2, x5, cc // cc = lo, ul, last + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x24, x26 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x25, x26 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x25, x26 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x26, x26 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x26, x26 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x25, x25 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x25, x25 + adcs x2, x2, x9 + mov x6, #0xffffffff // #4294967295 + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 // #-4294967295 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs 
xzr, x15, xzr + csel x24, x3, x16, cs // cs = hs, nlast + csel x25, x8, x14, cs // cs = hs, nlast + csel x26, x11, x12, cs // cs = hs, nlast + csel x27, x5, x2, cs // cs = hs, nlast + stp x25, x26, [sp, #112] + stp x24, x27, [sp, #96] + mov x0, sp + ldr q19, [sp, #32] + ldr q23, [sp, #48] + ldr q0, [sp, #32] + ldp x1, x10, [sp, #48] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x19, x20 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x19, x20 + umulh x15, x19, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x19, x20 + mov x12, v1.d[0] + csetm x5, cc // cc = lo, ul, last + cneg x6, x14, cc // cc = lo, ul, last + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc // cc = lo, ul, last + cinv x2, x5, cc // cc = lo, ul, last + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x20, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff // #4294967295 + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 // #-4294967295 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x16, x3, x16, cs // cs = hs, nlast + csel x14, x8, x14, cs // cs = hs, nlast + csel x12, x11, x12, cs // cs = hs, nlast + csel x2, x5, x2, cs // cs = hs, nlast + stp x14, x12, [x0, #16] + stp x16, x2, [x0] + ldr q20, [sp, #128] + ldr q0, [sp, #96] + ldp x6, x10, [sp, #128] + rev64 v16.4s, v20.4s + subs x4, x24, x27 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x27, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x25, x24 + ldr q20, [sp, #144] + sbcs x5, x26, x27 
+ ngc x17, xzr + subs x8, x25, x26 + uaddlp v27.2d, v16.4s + umulh x4, x24, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [sp, #112] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #144] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x25, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x26, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x25 + eor x1, x10, x5 + adcs x16, x2, x26 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, 
x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x19, x11, x13 + and x1, x1, x13 + adcs x20, x4, x1 + and x1, x12, x13 + stp x19, x20, [sp, #128] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #144] + ldr q20, [sp, #64] + ldr q0, [sp, #96] + ldp x6, x10, [sp, #64] + ldp x11, x15, [sp, #112] + rev64 v16.4s, v20.4s + subs x4, x24, x27 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x27, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x24 + ldr q20, [sp, #80] + sbcs x5, x15, x27 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x24, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [sp, #112] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #80] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, 
x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x9, x11, x13 + and x1, x1, x13 + adcs x10, x4, x1 + and x1, x12, x13 + stp x9, x10, [sp, #64] + adcs x11, x7, xzr + adc x12, x17, x1 + stp x11, x12, [sp, #80] + mov x0, sp + mov x1, sp + ldp x5, x6, [x1] + subs x5, x5, x19 + sbcs x6, x6, x20 + ldp x7, x8, [x1, #16] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc // cc = lo, ul, last + adds x24, x5, x3 + and x4, x3, #0xffffffff + adcs x25, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x7, x8, [x0, #16] + subs x5, x9, x19 + sbcs x6, x10, x20 + ldp x4, x3, [sp, #144] + sbcs x7, x11, x4 + sbcs x8, x12, x3 + csetm x3, cc // cc = lo, ul, last + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [sp, #96] + stp x7, x8, [sp, #112] + ldr q20, [x22, #64] + ldp x7, x17, [sp, #160] + ldr q0, [sp, #160] + ldp x6, x10, [x22, #64] + ldp x11, x15, [sp, #176] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [x22, #80] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [sp, #176] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x22, #80] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, 
x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [sp, #160] + adcs x19, x7, xzr + adc x20, x17, x1 + stp x19, x20, [sp, #176] + mov x0, sp + mov x1, sp + ldp x4, x3, [sp, #64] + subs x5, x24, x4 + sbcs x6, x25, x3 + ldp x7, x8, [x1, #16] + ldp x4, x3, [sp, #80] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc // cc = lo, ul, last + adds x9, x5, x3 + and x4, x3, #0xffffffff + adcs x10, x6, x4 + adcs x11, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x3, x8, x4 + stp x9, x10, [x0] + stp x11, x3, [x0, #16] + ldp x5, x6, [sp, #128] + subs x5, x5, x9 + sbcs x6, x6, x10 + ldp x7, x8, [sp, #144] + sbcs x7, x7, x11 + sbcs x8, x8, x3 + csetm x3, cc // cc = lo, ul, last + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, #144] + ldr q20, [sp, #192] + ldp x7, x17, [sp, #96] + ldr q0, [sp, #96] + ldp x6, x10, [sp, #192] + ldp x11, x15, [sp, #112] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + 
csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [sp, #208] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [sp, #112] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #208] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, 
x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [sp, #96] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #112] + ldr q20, [x23, #64] + ldp x7, x17, [sp, #160] + ldr q0, [sp, #160] + ldp x6, x10, [x23, #64] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x19, x7 + ldr q20, [x23, #80] + sbcs x5, x20, x17 + ngc x17, xzr + subs x8, x19, x20 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [sp, #176] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x23, #80] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x24, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x25, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor 
x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x24 + eor x1, x10, x5 + adcs x16, x2, x25 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x19, x11, x13 + and x1, x1, x13 + adcs x20, x4, x1 + and x1, x12, x13 + stp x19, x20, [sp, #160] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #176] + ldr q20, [sp, #128] + ldp x7, x17, [sp, #32] + ldr q0, [sp, #32] + ldp x6, x10, [sp, #128] + ldp x11, x15, [sp, #48] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [sp, #144] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [sp, #48] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #144] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg 
x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x5, x11, x13 + and x1, x1, x13 + adcs x6, x4, x1 + and x1, x12, x13 + adcs x7, x7, xzr + adc x9, x17, x1 + ldp x4, x3, [sp, #96] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x4, x3, [sp, #112] + sbcs x7, x7, x4 + sbcs x8, x9, x3 + csetm x3, cc // cc = lo, ul, last + adds x15, x5, x3 + and x4, x3, #0xffffffff + adcs x24, x6, x4 + adcs x25, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x26, x8, x4 + stp x15, x24, [sp, #128] + stp x25, x26, [sp, #144] + ldp x0, x1, [x22, #64] + ldp x2, x3, [x22, #80] + orr x12, x0, x1 + orr x13, x2, x3 + orr x12, x12, x13 + cmp x12, xzr + cset x12, ne // ne = any + ldp x4, x5, [x23, #64] + ldp x6, x7, [x23, #80] + orr x13, x4, x5 + orr x14, x6, x7 + orr x13, x13, x14 + cmp x13, xzr + cset x13, ne // ne = any + cmp x13, x12 + csel x8, x0, x19, cc // cc = lo, ul, last + csel x9, x1, x20, cc // cc = lo, ul, last + csel x8, x4, x8, hi // hi = pmore + csel x9, x5, x9, hi // hi = pmore + ldp x10, x11, [sp, #176] + csel x10, x2, x10, cc // cc = lo, ul, last + csel x11, x3, x11, cc // cc = lo, ul, last + csel x10, x6, x10, hi // hi = pmore + csel x11, x7, x11, hi // hi = pmore + ldp x12, x13, [x22] + ldp x0, x1, [sp] + csel x0, x12, x0, cc // cc = lo, ul, last + csel x1, x13, x1, cc // cc = lo, ul, last + ldp x12, x13, [x23] + csel x0, x12, x0, hi // hi = pmore + csel x1, x13, x1, hi // hi = pmore + ldp x12, x13, [x22, #16] + ldp x2, x3, [sp, #16] + csel x2, x12, x2, cc // cc = lo, ul, last + csel x3, x13, x3, cc // cc = lo, ul, last + ldp x12, x13, [x23, #16] + csel x2, x12, 
x2, hi // hi = pmore + csel x3, x13, x3, hi // hi = pmore + ldp x12, x13, [x22, #32] + csel x4, x12, x15, cc // cc = lo, ul, last + csel x5, x13, x24, cc // cc = lo, ul, last + ldp x12, x13, [x23, #32] + csel x4, x12, x4, hi // hi = pmore + csel x5, x13, x5, hi // hi = pmore + ldp x12, x13, [x22, #48] + csel x6, x12, x25, cc // cc = lo, ul, last + csel x7, x13, x26, cc // cc = lo, ul, last + ldp x12, x13, [x23, #48] + csel x6, x12, x6, hi // hi = pmore + csel x7, x13, x7, hi // hi = pmore + stp x0, x1, [x21] + stp x2, x3, [x21, #16] + stp x4, x5, [x21, #32] + stp x6, x7, [x21, #48] + stp x8, x9, [x21, #64] + stp x10, x11, [x21, #80] + add sp, sp, #0xe0 + ldp x27, x30, [sp], #16 + ldp x25, x26, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +p256_scalarmul_local_p256_montjdouble: + sub sp, sp, #0x110 + stp x19, x20, [sp, #192] + stp x21, x22, [sp, #208] + stp x23, x24, [sp, #224] + stp x25, x26, [sp, #240] + stp x27, xzr, [sp, #256] + mov x19, x0 + mov x20, x1 + mov x0, sp + ldr q19, [x20, #64] + ldp x9, x13, [x20, #64] + ldr q23, [x20, #80] + ldr q0, [x20, #64] + ldp x1, x10, [x20, #80] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc // cc = lo, ul, last + cneg x6, x14, cc // cc = lo, ul, last + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc // cc = lo, ul, last + cinv x2, x5, cc // cc = lo, ul, last + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff // #4294967295 + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 // #-4294967295 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds 
x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x21, x3, x16, cs // cs = hs, nlast + csel x22, x8, x14, cs // cs = hs, nlast + csel x23, x11, x12, cs // cs = hs, nlast + csel x24, x5, x2, cs // cs = hs, nlast + stp x22, x23, [x0, #16] + stp x21, x24, [x0] + ldr q19, [x20, #32] + ldp x9, x13, [x20, #32] + ldr q23, [x20, #48] + ldr q0, [x20, #32] + ldp x1, x10, [x20, #48] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc // cc = lo, ul, last + cneg x6, x14, cc // cc = lo, ul, last + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc // cc = lo, ul, last + cinv x2, x5, cc // cc = lo, ul, last + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff // #4294967295 + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 // #-4294967295 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x16, x3, x16, cs // cs = hs, nlast + csel x14, x8, x14, cs // cs = hs, nlast + csel x12, x11, x12, cs // cs = hs, nlast + csel x2, x5, x2, cs // cs = hs, nlast + stp x14, x12, [sp, #48] + stp x16, x2, [sp, #32] + ldp x5, x6, [x20] + subs x5, x5, x21 + sbcs x6, x6, 
x24 + ldp x7, x8, [x20, #16] + sbcs x7, x7, x22 + sbcs x8, x8, x23 + csetm x3, cc // cc = lo, ul, last + adds x10, x5, x3 + and x4, x3, #0xffffffff + adcs x25, x6, x4 + adcs x26, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x27, x8, x4 + stp x10, x25, [sp, #96] + stp x26, x27, [sp, #112] + ldp x5, x6, [x20] + adds x5, x5, x21 + adcs x6, x6, x24 + ldp x7, x8, [x20, #16] + adcs x7, x7, x22 + adcs x8, x8, x23 + csetm x3, cs // cs = hs, nlast + subs x9, x5, x3 + and x1, x3, #0xffffffff + sbcs x5, x6, x1 + sbcs x7, x7, xzr + and x2, x3, #0xffffffff00000001 + sbc x8, x8, x2 + stp x9, x5, [sp, #64] + stp x7, x8, [sp, #80] + ldr q20, [sp, #96] + ldr q0, [sp, #64] + rev64 v16.4s, v20.4s + subs x4, x9, x5 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x5, x25 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x7, x9 + ldr q20, [sp, #112] + sbcs x5, x8, x5 + ngc x17, xzr + subs x8, x7, x8 + uaddlp v27.2d, v16.4s + umulh x4, x9, x10 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x25, x10 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [sp, #80] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x10, x26 + sbcs x9, x25, x27 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x27, x26 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x21, x3, x13 + adcs x22, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x23, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x24, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, 
x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x21 + adcs x15, x16, x22 + eor x5, x17, x4 + adcs x9, x1, x23 + eor x1, x10, x5 + adcs x16, x2, x24 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x21, x11, x13 + and x1, x1, x13 + adcs x22, x4, x1 + and x1, x12, x13 + stp x21, x22, [sp, #96] + adcs x23, x7, xzr + adc x24, x17, x1 + stp x23, x24, [sp, #112] + ldp x4, x5, [x20, #32] + ldp x8, x9, [x20, #64] + adds x4, x4, x8 + adcs x5, x5, x9 + ldp x6, x7, [x20, #48] + ldp x10, x11, [x20, #80] + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x3, xzr, xzr + adds x8, x4, #0x1 + mov x9, #0xffffffff // #4294967295 + sbcs x9, x5, x9 + sbcs x10, x6, xzr + mov x11, #0xffffffff00000001 // #-4294967295 + sbcs x11, x7, x11 + sbcs x3, x3, xzr + csel x4, x4, x8, cc // cc = lo, ul, last + csel x5, x5, x9, cc // cc = lo, ul, last + csel x6, x6, x10, cc // cc = lo, ul, last + csel x7, x7, x11, cc // cc = lo, ul, last + stp x4, x5, [sp, #64] + stp x6, x7, [sp, #80] + ldr q20, [sp, #32] + ldp x7, x17, [x20] + ldr q0, [x20] + ldp x6, x10, [sp, #32] + ldp x11, x15, [x20, #16] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [sp, #48] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [x20, #16] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #48] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, 
x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x20, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x20 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x20, x11, x13 + and x1, x1, x13 + adcs x25, x4, x1 + and x1, x12, x13 + stp x20, x25, [sp, #128] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #144] + ldr q19, [sp, #96] + ldr q23, [sp, #112] + ldr q0, [sp, #96] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x21, x22 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x21, x22 + umulh x15, x21, x23 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x21, x22 + mov x12, v1.d[0] + csetm x5, cc // cc = lo, ul, last + cneg x6, x14, cc // cc = lo, ul, last + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x24, x23 + mov x9, v1.d[1] + cneg x17, x2, cc // cc = lo, ul, last + cinv x2, x5, cc // cc = lo, ul, last + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, 
x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x22, x24 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x23, x24 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x23, x24 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x24, x24 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x24, x24 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x23, x23 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x23, x23 + adcs x2, x2, x9 + mov x6, #0xffffffff // #4294967295 + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 // #-4294967295 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x21, x3, x16, cs // cs = hs, nlast + csel x22, x8, x14, cs // cs = hs, nlast + csel x23, x11, x12, cs // cs = hs, nlast + csel x24, x5, x2, cs // cs = hs, nlast + ldr q19, [sp, #64] + ldp x9, x13, [sp, #64] + ldr q23, [sp, #80] + ldr q0, [sp, #64] + ldp x1, x10, [sp, #80] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc // cc = lo, ul, last + cneg x6, x14, cc // cc = lo, ul, last + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc // cc = lo, ul, last + cinv x2, x5, cc // cc = lo, ul, last + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + 
adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff // #4294967295 + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 // #-4294967295 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x13, x3, x16, cs // cs = hs, nlast + csel x14, x8, x14, cs // cs = hs, nlast + csel x15, x11, x12, cs // cs = hs, nlast + csel x26, x5, x2, cs // cs = hs, nlast + mov x1, #0x9 // #9 + mov x2, #0xffffffffffffffff // #-1 + subs x9, x2, x21 + mov x2, #0xffffffff // #4294967295 + sbcs x10, x2, x24 + ngcs x11, x22 + mov x2, #0xffffffff00000001 // #-4294967295 + sbc x12, x2, x23 + mul x3, x1, x9 + mul x4, x1, x10 + mul x5, x1, x11 + mul x6, x1, x12 + umulh x9, x1, x9 + umulh x10, x1, x10 + umulh x11, x1, x11 + umulh x7, x1, x12 + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, xzr + mov x1, #0xc // #12 + mul x8, x20, x1 + umulh x9, x20, x1 + adds x3, x3, x8 + mul x8, x25, x1 + umulh x10, x25, x1 + adcs x4, x4, x8 + ldp x11, x12, [sp, #144] + mul x8, x11, x1 + umulh x11, x11, x1 + adcs x5, x5, x8 + mul x8, x12, x1 + umulh x12, x12, x1 + adcs x6, x6, x8 + adc x7, x7, xzr + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, x12 + add x8, x7, #0x1 + lsl x10, x8, #32 + adds x6, x6, x10 + adc x7, x7, xzr + neg x9, x8 + sub x10, x10, #0x1 + subs x3, x3, x9 + sbcs x4, x4, x10 + sbcs x5, x5, xzr + sbcs x6, x6, x8 + sbc x8, x7, x8 + adds x20, x3, x8 + and x9, x8, #0xffffffff + adcs x21, x4, x9 + adcs x22, x5, xzr + neg x10, x9 + adc x23, x6, x10 + stp x20, x21, [sp, #160] + stp x22, x23, [sp, #176] + mov x2, sp + ldp x4, x3, [x2] + subs x5, x13, x4 + sbcs x6, x26, x3 + ldp x4, x3, [x2, #16] + sbcs x7, x14, x4 + sbcs x8, x15, x3 + csetm x3, cc // cc = lo, ul, last + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [sp, #64] + stp x7, x8, [sp, #80] + mov x0, sp + ldr q19, [sp, #32] + ldp x9, x13, [sp, #32] + ldr q23, [sp, #48] + ldr q0, [sp, #32] + ldp x1, x10, [sp, #48] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc // cc = lo, ul, 
last + cneg x6, x14, cc // cc = lo, ul, last + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc // cc = lo, ul, last + cinv x2, x5, cc // cc = lo, ul, last + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff // #4294967295 + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 // #-4294967295 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x24, x3, x16, cs // cs = hs, nlast + csel x25, x8, x14, cs // cs = hs, nlast + csel x26, x11, x12, cs // cs = hs, nlast + csel x27, x5, x2, cs // cs = hs, nlast + stp x25, x26, [x0, #16] + stp x24, x27, [x0] + ldr q20, [sp, #96] + ldr q0, [sp, #160] + ldp x6, x10, [sp, #96] + rev64 v16.4s, v20.4s + subs x4, x20, x21 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x21, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x22, x20 + ldr q20, [sp, #112] + sbcs x5, x23, x21 + ngc x17, xzr + subs x8, x22, x23 + uaddlp v27.2d, v16.4s + umulh x4, x20, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [sp, #176] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #112] + adcs x12, x12, xzr + cmn x9, 
#0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x20, x3, x13 + adcs x21, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x22, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x23, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x20 + adcs x15, x16, x21 + eor x5, x17, x4 + adcs x9, x1, x22 + eor x1, x10, x5 + adcs x16, x2, x23 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x14, x11, x13 + and x1, x1, x13 + adcs x15, x4, x1 + and x1, x12, x13 + stp x14, x15, [sp, #96] + adcs x13, x7, xzr + adc x20, x17, x1 + stp x13, x20, [sp, #112] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #32] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #48] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc // cc = lo, ul, last + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, 
[x19, #64] + stp x7, x8, [x19, #80] + ldp x1, x2, [sp, #128] + lsl x0, x1, #2 + ldp x6, x7, [sp, #160] + subs x0, x0, x6 + extr x1, x2, x1, #62 + sbcs x1, x1, x7 + ldp x3, x4, [sp, #144] + extr x2, x3, x2, #62 + ldp x6, x7, [sp, #176] + sbcs x2, x2, x6 + extr x3, x4, x3, #62 + sbcs x3, x3, x7 + lsr x4, x4, #62 + sbc x4, x4, xzr + add x5, x4, #0x1 + lsl x8, x5, #32 + negs x6, x8 + ngcs x7, xzr + sbc x8, x8, x5 + adds x0, x0, x5 + adcs x1, x1, x6 + adcs x2, x2, x7 + adcs x3, x3, x8 + csetm x5, cc // cc = lo, ul, last + adds x0, x0, x5 + and x6, x5, #0xffffffff + adcs x1, x1, x6 + adcs x2, x2, xzr + neg x7, x6 + adc x3, x3, x7 + stp x0, x1, [x19] + stp x2, x3, [x19, #16] + mov x2, #0xffffffffffffffff // #-1 + subs x9, x2, x24 + mov x2, #0xffffffff // #4294967295 + sbcs x10, x2, x27 + ngcs x11, x25 + mov x2, #0xffffffff00000001 // #-4294967295 + sbc x12, x2, x26 + lsl x3, x9, #3 + extr x4, x10, x9, #61 + extr x5, x11, x10, #61 + extr x6, x12, x11, #61 + lsr x7, x12, #61 + mov x1, #0x3 // #3 + mul x8, x14, x1 + umulh x9, x14, x1 + adds x3, x3, x8 + mul x8, x15, x1 + umulh x10, x15, x1 + adcs x4, x4, x8 + mul x8, x13, x1 + umulh x11, x13, x1 + adcs x5, x5, x8 + mul x8, x20, x1 + umulh x12, x20, x1 + adcs x6, x6, x8 + adc x7, x7, xzr + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, x12 + add x8, x7, #0x1 + lsl x10, x8, #32 + adds x6, x6, x10 + adc x7, x7, xzr + neg x9, x8 + sub x10, x10, #0x1 + subs x3, x3, x9 + sbcs x4, x4, x10 + sbcs x5, x5, xzr + sbcs x6, x6, x8 + sbc x8, x7, x8 + adds x3, x3, x8 + and x9, x8, #0xffffffff + adcs x4, x4, x9 + adcs x5, x5, xzr + neg x10, x9 + adc x6, x6, x10 + stp x3, x4, [x19, #32] + stp x5, x6, [x19, #48] + ldp x27, xzr, [sp, #256] + ldp x25, x26, [sp, #240] + ldp x23, x24, [sp, #224] + ldp x21, x22, [sp, #208] + ldp x19, x20, [sp, #192] + add sp, sp, #0x110 + ret + +p256_scalarmul_local_p256_montjmixadd: + stp x19, x20, [sp, #-16]! 
+ sub sp, sp, #0xc0 + mov x17, x0 + mov x19, x1 + mov x20, x2 + ldp x2, x3, [x19, #64] + ldp x4, x5, [x19, #80] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x0, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x0, x0, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x0, x0, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + subs x13, x15, x12 + lsr x11, x15, #32 + sbc x15, x15, x11 + adds x16, x16, x12 + adcs x0, x0, x11 + adcs x1, x1, x13 + adc x15, x15, xzr + lsl x12, x16, #32 + subs x13, x16, x12 + lsr x11, x16, #32 + sbc x16, x16, x11 + adds x0, x0, x12 + adcs x1, x1, x11 + adcs x15, x15, x13 + adc x16, x16, xzr + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, cc + csetm x13, cc + subs x12, x5, x4 + cneg x12, x12, cc + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, cc + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x0 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + subs x13, x6, x12 + lsr x11, x6, #32 + sbc x6, x6, x11 + adds x7, x7, x12 + adcs x8, x8, x11 + adcs x9, x9, x13 + adcs x10, x10, x6 + adc x6, xzr, xzr + lsl x12, x7, #32 + subs x13, x7, x12 + lsr x11, x7, #32 + sbc x7, x7, x11 + adds x8, x8, x12 + adcs x9, x9, x11 + adcs x10, x10, x13 + adcs x6, x6, x7 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #0xffffffff + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #0xffffffff00000001 + sbcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, cs + csel x9, x11, x9, cs + csel x10, x12, x10, cs + csel x6, x13, x6, cs + stp x8, x9, [sp] + stp x10, x6, [sp, #16] + ldp x3, x4, [x19, #64] + ldp x5, x6, [x19, #80] + ldp x7, x8, [x20, #32] + ldp x9, x10, [x20, #48] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #32] + stp x11, x12, [sp, #48] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 
+ adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #32] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #48] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #32] + stp x3, x4, [sp, #48] + ldp x3, x4, [sp] + ldp x5, x6, [sp, #16] + ldp x7, x8, [x20] + ldp x9, x10, [x20, #16] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #64] + stp x11, x12, [sp, #80] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + 
csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #64] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #80] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #64] + stp x3, x4, [sp, #80] + ldp x3, x4, [sp] + ldp x5, x6, [sp, #16] + ldp x7, x8, [sp, #32] + ldp x9, x10, [sp, #48] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #32] + stp x11, x12, [sp, #48] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 
+ cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #32] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #48] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #32] + stp x3, x4, [sp, #48] + ldp x5, x6, [sp, #64] + ldp x4, x3, [x19] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [x19, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #160] + stp x7, x8, [sp, #176] + ldp x5, x6, [sp, #32] + ldp x4, x3, [x19, #32] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #48] + ldp x4, x3, [x19, #48] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #32] + stp x7, x8, [sp, #48] + ldp x2, x3, [sp, #160] + ldp x4, x5, [sp, #176] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x0, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x0, x0, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x0, x0, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + subs x13, x15, x12 + lsr x11, x15, #32 + sbc x15, x15, x11 + adds x16, x16, x12 + adcs x0, x0, x11 + adcs x1, x1, x13 + adc x15, x15, xzr + lsl 
x12, x16, #32 + subs x13, x16, x12 + lsr x11, x16, #32 + sbc x16, x16, x11 + adds x0, x0, x12 + adcs x1, x1, x11 + adcs x15, x15, x13 + adc x16, x16, xzr + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, cc + csetm x13, cc + subs x12, x5, x4 + cneg x12, x12, cc + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, cc + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x0 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + subs x13, x6, x12 + lsr x11, x6, #32 + sbc x6, x6, x11 + adds x7, x7, x12 + adcs x8, x8, x11 + adcs x9, x9, x13 + adcs x10, x10, x6 + adc x6, xzr, xzr + lsl x12, x7, #32 + subs x13, x7, x12 + lsr x11, x7, #32 + sbc x7, x7, x11 + adds x8, x8, x12 + adcs x9, x9, x11 + adcs x10, x10, x13 + adcs x6, x6, x7 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #0xffffffff + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #0xffffffff00000001 + sbcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, cs + csel x9, x11, x9, cs + csel x10, x12, x10, cs + csel x6, x13, x6, cs + stp x8, x9, [sp, #96] + stp x10, x6, [sp, #112] + ldp x2, x3, [sp, #32] + ldp x4, x5, [sp, #48] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x0, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x0, x0, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x0, x0, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + subs x13, x15, x12 + lsr x11, x15, #32 + sbc x15, x15, x11 + adds x16, x16, x12 + adcs x0, x0, x11 + adcs x1, x1, x13 + adc x15, x15, xzr + lsl x12, x16, #32 + subs x13, x16, x12 + lsr x11, x16, #32 + sbc x16, x16, x11 + adds x0, x0, x12 + adcs x1, x1, x11 + adcs x15, x15, x13 + adc x16, x16, xzr + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, cc + csetm x13, cc + subs x12, x5, x4 + cneg x12, x12, cc + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, cc + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x0 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + subs x13, x6, x12 + lsr x11, x6, #32 + sbc x6, x6, x11 + adds x7, x7, x12 + adcs x8, x8, x11 + adcs x9, x9, x13 + adcs x10, x10, x6 + adc x6, xzr, xzr + lsl x12, x7, #32 + subs x13, x7, x12 + lsr x11, x7, #32 + sbc x7, x7, x11 + adds x8, x8, x12 + adcs 
x9, x9, x11 + adcs x10, x10, x13 + adcs x6, x6, x7 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #0xffffffff + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #0xffffffff00000001 + sbcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, cs + csel x9, x11, x9, cs + csel x10, x12, x10, cs + csel x6, x13, x6, cs + stp x8, x9, [sp] + stp x10, x6, [sp, #16] + ldp x3, x4, [sp, #96] + ldp x5, x6, [sp, #112] + ldp x7, x8, [x19] + ldp x9, x10, [x19, #16] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #128] + stp x11, x12, [sp, #144] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #128] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #144] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, 
x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #128] + stp x3, x4, [sp, #144] + ldp x3, x4, [sp, #96] + ldp x5, x6, [sp, #112] + ldp x7, x8, [sp, #64] + ldp x9, x10, [sp, #80] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #64] + stp x11, x12, [sp, #80] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #64] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #80] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc 
x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #64] + stp x3, x4, [sp, #80] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #16] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #16] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #96] + stp x7, x8, [sp, #112] + ldp x3, x4, [sp, #160] + ldp x5, x6, [sp, #176] + ldp x7, x8, [x19, #64] + ldp x9, x10, [x19, #80] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #160] + stp x11, x12, [sp, #176] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #160] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #176] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, 
x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #160] + stp x3, x4, [sp, #176] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #64] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #16] + ldp x4, x3, [sp, #80] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #16] + ldp x5, x6, [sp, #128] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #144] + ldp x4, x3, [sp, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, #144] + ldp x3, x4, [sp, #96] + ldp x5, x6, [sp, #112] + ldp x7, x8, [x19, #32] + ldp x9, x10, [x19, #48] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #96] + stp x11, x12, [sp, #112] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, 
x1, [sp, #96] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #112] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #96] + stp x3, x4, [sp, #112] + ldp x3, x4, [sp, #32] + ldp x5, x6, [sp, #48] + ldp x7, x8, [sp, #128] + ldp x9, x10, [sp, #144] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #128] + stp x11, x12, [sp, #144] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #128] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #144] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, 
xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #128] + stp x3, x4, [sp, #144] + ldp x5, x6, [sp, #128] + ldp x4, x3, [sp, #96] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #144] + ldp x4, x3, [sp, #112] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, #144] + ldp x0, x1, [x19, #64] + ldp x2, x3, [x19, #80] + orr x4, x0, x1 + orr x5, x2, x3 + orr x4, x4, x5 + cmp x4, xzr + ldp x0, x1, [sp] + ldp x12, x13, [x20] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x2, x3, [sp, #16] + ldp x12, x13, [x20, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x4, x5, [sp, #128] + ldp x12, x13, [x20, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x6, x7, [sp, #144] + ldp x12, x13, [x20, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x8, x9, [sp, #160] + mov x12, #0x1 + mov x13, #0xffffffff00000000 + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x10, x11, [sp, #176] + mov x12, #0xffffffffffffffff + mov x13, #0xfffffffe + csel x10, x10, x12, ne + csel x11, x11, x13, ne + stp x0, x1, [x17] + stp x2, x3, [x17, #16] + stp x4, x5, [x17, #32] + stp x6, x7, [x17, #48] + stp x8, x9, [x17, #64] + stp x10, x11, [x17, #80] + add sp, sp, #0xc0 + ldp x19, x20, [sp], #16 + ret + + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_scalarmul_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_scalarmul_alt.S new file mode 100644 index 00000000000..77e3349e34c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_scalarmul_alt.S @@ -0,0 +1,6190 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
+
+// ----------------------------------------------------------------------------
+// Scalar multiplication for P-256
+// Input scalar[4], point[8]; output res[8]
+//
+// extern void p256_scalarmul_alt
+//   (uint64_t res[static 8],
+//    uint64_t scalar[static 4],
+//    uint64_t point[static 8]);
+//
+// Given scalar = n and point = P, assumed to be on the NIST elliptic
+// curve P-256, returns the point (X,Y) = n * P. The input and output
+// are affine points, and in the case of the point at infinity as
+// the result, (0,0) is returned.
+//
+// Standard ARM ABI: X0 = res, X1 = scalar, X2 = point
+// ----------------------------------------------------------------------------
+
+#include "_internal_s2n_bignum.h"
+
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_scalarmul_alt)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_scalarmul_alt)
+
+        .text
+        .balign 4
+
+// Size of individual field elements
+
+#define NUMSIZE 32
+
+// Safe copies of inputs (res lasts the whole code, point not so long)
+// and additional values in variables, with some aliasing
+
+#define res x19
+#define sgn x20
+#define j x20
+#define point x21
+
+// Intermediate variables on the stack. The last z2, z3 values can
+// safely be overlaid on the table, which is no longer needed at the end.
+
+#define scalarb sp, #(0*NUMSIZE)
+#define acc sp, #(1*NUMSIZE)
+#define tabent sp, #(4*NUMSIZE)
+
+#define tab sp, #(7*NUMSIZE)
+
+#define z2 sp, #(7*NUMSIZE)
+#define z3 sp, #(8*NUMSIZE)
+
+#define NSPACE #(31*NUMSIZE)
+
+// Loading large constants
+
+#define movbig(nn,n3,n2,n1,n0) \
+        movz nn, n0 __LF \
+        movk nn, n1, lsl #16 __LF \
+        movk nn, n2, lsl #32 __LF \
+        movk nn, n3, lsl #48
+
+S2N_BN_SYMBOL(p256_scalarmul_alt):
+
+        stp x19, x20, [sp, #-16]!
+        stp x21, x30, [sp, #-16]!
+        sub sp, sp, NSPACE
+
+// Preserve the "res" and "point" input arguments. We load and process the
+// scalar immediately so we don't bother preserving that input argument.
+// Also, "point" is only needed early on and so its register gets re-used.
+
+        mov res, x0
+        mov point, x2
+
+// Load the digits of group order n_256 = [x12;x13;x14;x15]
+
+        movbig(x12, #0xf3b9, #0xcac2, #0xfc63, #0x2551)
+        movbig(x13, #0xbce6, #0xfaad, #0xa717, #0x9e84)
+        mov x14, #0xffffffffffffffff
+        mov x15, #0xffffffff00000000
+
+// First, reduce the input scalar mod n_256, i.e. conditionally subtract n_256
+
+        ldp x2, x3, [x1]
+        ldp x4, x5, [x1, #16]
+
+        subs x6, x2, x12
+        sbcs x7, x3, x13
+        sbcs x8, x4, x14
+        sbcs x9, x5, x15
+
+        csel x2, x2, x6, cc
+        csel x3, x3, x7, cc
+        csel x4, x4, x8, cc
+        csel x5, x5, x9, cc
+
+// Now if the top bit of the reduced scalar is set, negate it mod n_256,
+// i.e. do n |-> n_256 - n. Remember the sign as "sgn" so we can
+// correspondingly negate the point below.
+
+        subs x6, x12, x2
+        sbcs x7, x13, x3
+        sbcs x8, x14, x4
+        sbc x9, x15, x5
+
+        tst x5, #0x8000000000000000
+        csel x2, x2, x6, eq
+        csel x3, x3, x7, eq
+        csel x4, x4, x8, eq
+        csel x5, x5, x9, eq
+        cset sgn, ne
+
+// In either case then add the recoding constant 0x08888...888 to allow
+// signed digits.
+
+        mov x6, 0x8888888888888888
+        adds x2, x2, x6
+        adcs x3, x3, x6
+        bic x7, x6, #0xF000000000000000
+        adcs x4, x4, x6
+        adc x5, x5, x7
+
+        stp x2, x3, [scalarb]
+        stp x4, x5, [scalarb+16]
+
+// Set the tab[0] table entry to Montgomery-Jacobian point = 1 * P
+// The z coordinate is just the Montgomery form of the constant 1.
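The recoding above means that, in the main loop further down, each 4-bit window of the stored scalar encodes the signed digit w - 8 in [-8, 7]: its magnitude selects one of the precomputed multiples 1*P .. 8*P and its sign decides whether the y coordinate is negated. The C fragment below is a minimal sketch of that window handling, not part of this patch or of the imported s2n-bignum sources; the 12-limb Jacobian row layout and the helper names recode_window and select_table_row are hypothetical, chosen only to mirror the subs/cset/cneg and cmp/csel sequences in the assembly.

#include <stddef.h>
#include <stdint.h>

/* Illustrative sketch only; hypothetical helpers, not s2n-bignum code.
   A Jacobian point is stored as 12 little-endian 64-bit limbs: X|Y|Z. */
#define P256_JAC_LIMBS 12

/* Decode one recoded 4-bit window w (0..15). After the 0x0888...888 offset,
   w represents the signed digit w - 8 in [-8, 7]; return its magnitude and a
   sign flag without any data-dependent branch. */
static void recode_window(uint64_t w, uint64_t *sign, uint64_t *mag) {
  uint64_t d = w - 8;                 /* two's-complement signed digit */
  *sign = d >> 63;                    /* 1 iff the digit is negative */
  *mag = (d ^ (0 - *sign)) + *sign;   /* |d| in 0..8 */
}

/* Constant-time selection of row idx (1-based) from a table of `height`
   Jacobian points; idx == 0 leaves the all-zero placeholder, matching the
   zero-initialized accumulator before the cmp/csel block. */
static void select_table_row(uint64_t out[P256_JAC_LIMBS],
                             const uint64_t *table, size_t height,
                             uint64_t idx) {
  for (size_t j = 0; j < P256_JAC_LIMBS; j++) out[j] = 0;
  for (uint64_t i = 1; i <= height; i++) {
    uint64_t diff = i ^ idx;
    /* mask = all-ones iff i == idx, computed without a secret-dependent branch */
    uint64_t mask = 0 - (uint64_t)(((diff | (0 - diff)) >> 63) ^ 1);
    for (size_t j = 0; j < P256_JAC_LIMBS; j++)
      out[j] |= table[(i - 1) * P256_JAC_LIMBS + j] & mask;
  }
}

The masked-OR loop plays the same role as the cmp/csel chain in the .rep blocks: every table row is read regardless of the digit value, so the memory access pattern does not depend on the secret scalar; the assembly additionally negates the selected y coordinate mod p when the sign flag is set.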
+ + add x0, tab + mov x1, point + bl p256_scalarmul_alt_local_tomont_p256 + + add x1, point, #32 + add x0, tab+32 + bl p256_scalarmul_alt_local_tomont_p256 + + mov x0, #0x0000000000000001 + mov x1, #0xffffffff00000000 + stp x0, x1, [tab+64] + mov x2, #0xffffffffffffffff + mov x3, #0x00000000fffffffe + stp x2, x3, [tab+80] + +// If the top bit of the scalar was set, negate (y coordinate of) the point + + ldp x4, x5, [tab+32] + ldp x6, x7, [tab+48] + + mov x0, 0xffffffffffffffff + subs x0, x0, x4 + mov x1, 0x00000000ffffffff + sbcs x1, x1, x5 + mov x3, 0xffffffff00000001 + sbcs x2, xzr, x6 + sbc x3, x3, x7 + + cmp sgn, xzr + csel x4, x0, x4, ne + csel x5, x1, x5, ne + csel x6, x2, x6, ne + csel x7, x3, x7, ne + + stp x4, x5, [tab+32] + stp x6, x7, [tab+48] + +// Compute and record tab[1] = 2 * p, ..., tab[7] = 8 * P + + add x0, tab+96*1 + add x1, tab + bl p256_scalarmul_alt_local_p256_montjdouble + + add x0, tab+96*2 + add x1, tab+96*1 + add x2, tab + bl p256_scalarmul_alt_local_p256_montjmixadd + + add x0, tab+96*3 + add x1, tab+96*1 + bl p256_scalarmul_alt_local_p256_montjdouble + + add x0, tab+96*4 + add x1, tab+96*3 + add x2, tab + bl p256_scalarmul_alt_local_p256_montjmixadd + + add x0, tab+96*5 + add x1, tab+96*2 + bl p256_scalarmul_alt_local_p256_montjdouble + + add x0, tab+96*6 + add x1, tab+96*5 + add x2, tab + bl p256_scalarmul_alt_local_p256_montjmixadd + + add x0, tab+96*7 + add x1, tab+96*3 + bl p256_scalarmul_alt_local_p256_montjdouble + +// Initialize the accumulator as a table entry for top 4 bits (unrecoded) + + ldr x14, [scalarb+24] + lsr x14, x14, #60 + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + add x15, tab + + .set i, 1 +.rep 8 + cmp x14, #i + ldp x12, x13, [x15] + csel x0, x12, x0, eq + csel x1, x13, x1, eq + ldp x12, x13, [x15, #16] + csel x2, x12, x2, eq + csel x3, x13, x3, eq + ldp x12, x13, [x15, #32] + csel x4, x12, x4, eq + csel x5, x13, x5, eq + ldp x12, x13, [x15, #48] + csel x6, x12, x6, eq + csel x7, x13, x7, eq + ldp x12, x13, [x15, #64] + csel x8, x12, x8, eq + csel x9, x13, x9, eq + ldp x12, x13, [x15, #80] + csel x10, x12, x10, eq + csel x11, x13, x11, eq + add x15, x15, #96 + .set i, (i+1) +.endr + stp x0, x1, [acc] + stp x2, x3, [acc+16] + stp x4, x5, [acc+32] + stp x6, x7, [acc+48] + stp x8, x9, [acc+64] + stp x10, x11, [acc+80] + + mov j, #252 + +// Main loop over size-4 bitfields: double 4 times then add signed digit + +p256_scalarmul_alt_loop: + sub j, j, #4 + + add x0, acc + add x1, acc + bl p256_scalarmul_alt_local_p256_montjdouble + + add x0, acc + add x1, acc + bl p256_scalarmul_alt_local_p256_montjdouble + + add x0, acc + add x1, acc + bl p256_scalarmul_alt_local_p256_montjdouble + + add x0, acc + add x1, acc + bl p256_scalarmul_alt_local_p256_montjdouble + + lsr x2, j, #6 + ldr x14, [sp, x2, lsl #3] // Exploits scalarb = sp exactly + lsr x14, x14, j + and x14, x14, #15 + + subs x14, x14, #8 + cset x16, lo // x16 = sign of digit (1 = negative) + cneg x14, x14, lo // x14 = absolute value of digit + +// Conditionally select the table entry tab[i-1] = i * P in constant time + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + add x15, tab + .set i, 1 +.rep 8 + cmp x14, #i + ldp x12, x13, [x15] + csel x0, x12, x0, eq + csel x1, x13, x1, eq + ldp x12, x13, [x15, #16] + csel x2, x12, x2, eq + csel x3, x13, 
x3, eq + ldp x12, x13, [x15, #32] + csel x4, x12, x4, eq + csel x5, x13, x5, eq + ldp x12, x13, [x15, #48] + csel x6, x12, x6, eq + csel x7, x13, x7, eq + ldp x12, x13, [x15, #64] + csel x8, x12, x8, eq + csel x9, x13, x9, eq + ldp x12, x13, [x15, #80] + csel x10, x12, x10, eq + csel x11, x13, x11, eq + add x15, x15, #96 + .set i, (i+1) +.endr + +// Store it to "tabent" with the y coordinate optionally negated + + stp x0, x1, [tabent] + stp x2, x3, [tabent+16] + + mov x0, 0xffffffffffffffff + subs x0, x0, x4 + mov x1, 0x00000000ffffffff + sbcs x1, x1, x5 + mov x3, 0xffffffff00000001 + sbcs x2, xzr, x6 + sbc x3, x3, x7 + + cmp x16, xzr + csel x4, x0, x4, ne + csel x5, x1, x5, ne + csel x6, x2, x6, ne + csel x7, x3, x7, ne + + stp x4, x5, [tabent+32] + stp x6, x7, [tabent+48] + stp x8, x9, [tabent+64] + stp x10, x11, [tabent+80] + + add x0, acc + add x1, acc + add x2, tabent + bl p256_scalarmul_alt_local_p256_montjadd + + cbnz j, p256_scalarmul_alt_loop + +// That's the end of the main loop, and we just need to translate +// back from the Jacobian representation to affine. First of all, +// let z2 = 1/z^2 and z3 = 1/z^3, both without Montgomery form + + add x0, z2 + add x1, acc+64 + bl p256_scalarmul_alt_local_montsqr_p256 + + add x0, z3 + add x2, z2 + add x1, acc+64 + bl p256_scalarmul_alt_local_montmul_p256 + + add x0, z2 + add x1, z3 + bl p256_scalarmul_alt_local_demont_p256 + + add x0, z3 + add x1, z2 + bl p256_scalarmul_alt_local_inv_p256 + + add x0, z2 + add x2, z3 + add x1, acc+64 + bl p256_scalarmul_alt_local_montmul_p256 + +// Convert back from Jacobian (X,Y,Z) |-> (X/Z^2, Y/Z^3) + + add x1, acc + add x2, z2 + mov x0, res + bl p256_scalarmul_alt_local_montmul_p256 + + add x0, res, #32 + add x1, acc+32 + add x2, z3 + bl p256_scalarmul_alt_local_montmul_p256 + +// Restore stack and registers and return + + add sp, sp, NSPACE + ldp x21, x30, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +// Local copies of subroutines, complete clones at the moment + +p256_scalarmul_alt_local_demont_p256: + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + lsl x7, x2, #32 + subs x8, x2, x7 + lsr x6, x2, #32 + sbc x2, x2, x6 + adds x3, x3, x7 + adcs x4, x4, x6 + adcs x5, x5, x8 + adc x2, x2, xzr + lsl x7, x3, #32 + subs x8, x3, x7 + lsr x6, x3, #32 + sbc x3, x3, x6 + adds x4, x4, x7 + adcs x5, x5, x6 + adcs x2, x2, x8 + adc x3, x3, xzr + lsl x7, x4, #32 + subs x8, x4, x7 + lsr x6, x4, #32 + sbc x4, x4, x6 + adds x5, x5, x7 + adcs x2, x2, x6 + adcs x3, x3, x8 + adc x4, x4, xzr + lsl x7, x5, #32 + subs x8, x5, x7 + lsr x6, x5, #32 + sbc x5, x5, x6 + adds x2, x2, x7 + adcs x3, x3, x6 + adcs x4, x4, x8 + adc x5, x5, xzr + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + ret + +p256_scalarmul_alt_local_inv_p256: + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! 
+ sub sp, sp, #0xa0 + mov x20, x0 + mov x10, #0xffffffffffffffff + mov x11, #0xffffffff + mov x13, #0xffffffff00000001 + stp x10, x11, [sp] + stp xzr, x13, [sp, #16] + str xzr, [sp, #32] + ldp x2, x3, [x1] + subs x10, x2, x10 + sbcs x11, x3, x11 + ldp x4, x5, [x1, #16] + sbcs x12, x4, xzr + sbcs x13, x5, x13 + csel x2, x2, x10, cc + csel x3, x3, x11, cc + csel x4, x4, x12, cc + csel x5, x5, x13, cc + stp x2, x3, [sp, #48] + stp x4, x5, [sp, #64] + str xzr, [sp, #80] + stp xzr, xzr, [sp, #96] + stp xzr, xzr, [sp, #112] + mov x10, #0x4000000000000 + stp x10, xzr, [sp, #128] + stp xzr, xzr, [sp, #144] + mov x21, #0xa + mov x22, #0x1 + b p256_scalarmul_alt_inv_midloop +p256_scalarmul_alt_inv_loop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #48] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #64] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #56] + ldr x7, [sp, #24] + eor x1, x7, x14 + ldr x23, [sp, #32] + eor x3, x23, x14 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #72] + eor x1, x8, x15 + ldr x24, [sp, #80] + eor x0, x24, x15 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + asr x3, x3, #59 + str x3, [sp, #32] + eor x1, x7, x16 + eor x5, x23, x16 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + eor x0, x24, x17 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #64] + extr x2, x5, x2, #59 + str x2, [sp, #72] + asr x5, x5, #59 + str x5, [sp, #80] + ldr x7, [sp, #96] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #128] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #96] + adc x2, x2, x1 + eor x1, 
x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #128] + adc x3, x3, x1 + ldr x7, [sp, #104] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #136] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #104] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #136] + adc x4, x4, x1 + ldr x7, [sp, #112] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #144] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #112] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #144] + adc x2, x2, x1 + ldr x7, [sp, #120] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #152] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + ldp x0, x1, [sp, #96] + ldr x6, [sp, #112] + mov x14, #0xe000000000000000 + adds x0, x0, x14 + sbcs x1, x1, xzr + mov x11, #0x1fffffff + adcs x6, x6, x11 + mov x10, #0x2000000000000000 + adcs x5, x5, x10 + mov x14, #0x1fffffffe0000000 + adc x3, x3, x14 + lsl x11, x0, #32 + subs x14, x0, x11 + lsr x10, x0, #32 + sbc x0, x0, x10 + adds x1, x1, x11 + adcs x6, x6, x10 + adcs x5, x5, x14 + adcs x3, x3, x0 + mov x14, #0xffffffffffffffff + mov x11, #0xffffffff + mov x10, #0xffffffff00000001 + csel x14, x14, xzr, cs + csel x11, x11, xzr, cs + csel x10, x10, xzr, cs + subs x1, x1, x14 + sbcs x6, x6, x11 + sbcs x5, x5, xzr + sbc x3, x3, x10 + stp x1, x6, [sp, #96] + stp x5, x3, [sp, #112] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + ldp x0, x1, [sp, #128] + ldr x3, [sp, #144] + mov x14, #0xe000000000000000 + adds x0, x0, x14 + sbcs x1, x1, xzr + mov x11, #0x1fffffff + adcs x3, x3, x11 + mov x10, #0x2000000000000000 + adcs x2, x2, x10 + mov x14, #0x1fffffffe0000000 + adc x5, x5, x14 + lsl x11, x0, #32 + subs x14, x0, x11 + lsr x10, x0, #32 + sbc x0, x0, x10 + adds x1, x1, x11 + adcs x3, x3, x10 + adcs x2, x2, x14 + adcs x5, x5, x0 + mov x14, #0xffffffffffffffff + mov x11, #0xffffffff + mov x10, #0xffffffff00000001 + csel x14, x14, xzr, cs + csel x11, x11, xzr, cs + csel x10, x10, xzr, cs + subs x1, x1, x14 + sbcs x3, x3, x11 + sbcs x2, x2, xzr + sbc x5, x5, x10 + stp x1, x3, [sp, #128] + stp x2, x5, [sp, #144] +p256_scalarmul_alt_inv_midloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #48] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst 
x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, 
x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, 
x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, 
ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne p256_scalarmul_alt_inv_loop + ldr x0, [sp] + ldr x1, [sp, #48] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #96] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #128] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #96] + adc x2, x2, x1 + ldr x7, [sp, #104] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #136] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #104] + adc x6, x6, x1 + ldr x7, [sp, #112] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #144] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #112] + adc x5, x5, x1 + ldr x7, [sp, #120] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #152] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + ldp x0, x1, [sp, #96] + ldr x2, [sp, #112] + mov x14, #0xe000000000000000 + adds x0, x0, x14 + sbcs x1, x1, xzr + mov x11, #0x1fffffff + adcs x2, x2, x11 + mov x10, #0x2000000000000000 + adcs x5, x5, x10 + mov x14, #0x1fffffffe0000000 + adc x3, x3, x14 + lsl x11, x0, #32 + subs x14, x0, x11 + lsr x10, x0, #32 + sbc x0, x0, x10 + adds x1, x1, x11 + adcs x2, x2, x10 + adcs x5, x5, x14 + adcs x3, x3, x0 + mov x14, #0xffffffffffffffff + mov x11, #0xffffffff + mov x10, #0xffffffff00000001 + csel x14, x14, xzr, cs + csel x11, x11, xzr, cs + csel x10, x10, xzr, cs + subs x1, x1, x14 + sbcs x2, x2, x11 + sbcs x5, x5, xzr + sbc x3, x3, x10 + mov x10, #0xffffffffffffffff + subs x10, x1, x10 + mov x11, #0xffffffff + sbcs x11, x2, x11 + mov x13, #0xffffffff00000001 + sbcs x12, x5, xzr + sbcs x13, x3, x13 + csel x10, x1, x10, cc + csel x11, x2, x11, cc + csel x12, x5, x12, cc + csel x13, x3, x13, cc + stp x10, x11, [x20] + stp x12, x13, [x20, #16] + add sp, sp, #0xa0 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +p256_scalarmul_alt_local_montmul_p256: + ldp x3, x4, [x1] + ldp x7, x8, [x2] + mul x12, x3, x7 + umulh x13, x3, x7 + mul 
x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x2, #16] + mul x11, x3, x9 + umulh x15, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x16, x3, x10 + adcs x15, x15, x11 + adc x16, x16, xzr + ldp x5, x6, [x1, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x15, x15, x11 + mul x11, x4, x10 + adcs x16, x16, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x15, x15, x11 + umulh x11, x4, x9 + adcs x16, x16, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x15, x15, x11 + mul x11, x5, x9 + adcs x16, x16, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x15, x15, x11 + umulh x11, x5, x8 + adcs x16, x16, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x15, x15, x11 + mul x11, x6, x8 + adcs x16, x16, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x15, x15, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x16, x16, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x15, x15, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x15, x15, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x15, lsl #32 + lsr x11, x15, #32 + adcs x13, x13, x11 + mul x11, x15, x10 + umulh x15, x15, x10 + adcs x14, x14, x11 + adc x15, x15, xzr + adds x12, x12, x16 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x15, x15, x5 + cset x8, cs + mov x11, #0xffffffff + adds x16, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x15, x10 + sbcs xzr, x8, xzr + csel x12, x12, x16, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x15, x15, x5, cc + stp x12, x13, [x0] + stp x14, x15, [x0, #16] + ret + +p256_scalarmul_alt_local_montsqr_p256: + ldp x2, x3, [x1] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x1, #16] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + mov x5, #0xffffffff00000001 + adds x9, x9, x8, lsl #32 + lsr x2, x8, #32 + adcs x10, x10, x2 + mul x2, x8, x5 + umulh x8, x8, x5 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x2, x9, #32 + adcs x11, x11, x2 + mul x2, x9, x5 + umulh x9, x9, x5 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x2, x10, #32 + adcs x8, x8, x2 + mul x2, x10, x5 + umulh 
x10, x10, x5 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x2, x11, #32 + adcs x9, x9, x2 + mul x2, x11, x5 + umulh x11, x11, x5 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, cs + mov x3, #0xffffffff + adds x12, x8, #0x1 + sbcs x13, x9, x3 + sbcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, cc + csel x9, x9, x13, cc + csel x10, x10, x14, cc + csel x11, x11, x7, cc + stp x8, x9, [x0] + stp x10, x11, [x0, #16] + ret + +p256_scalarmul_alt_local_tomont_p256: + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x1, #0xffffffffffffffff + mov x7, #0xffffffff + mov x9, #0xffffffff00000001 + subs x1, x2, x1 + sbcs x7, x3, x7 + sbcs x8, x4, xzr + sbcs x9, x5, x9 + csel x2, x2, x1, cc + csel x3, x3, x7, cc + csel x4, x4, x8, cc + csel x5, x5, x9, cc + cmp xzr, xzr + extr x9, x5, x4, #32 + adcs xzr, x4, x9 + lsr x9, x5, #32 + adcs x9, x5, x9 + csetm x6, cs + orr x9, x9, x6 + lsl x7, x9, #32 + lsr x8, x9, #32 + adds x4, x4, x7 + adc x5, x5, x8 + negs x6, x9 + sbcs x7, x7, xzr + sbc x8, x8, xzr + negs x6, x6 + sbcs x2, x2, x7 + sbcs x3, x3, x8 + sbcs x4, x4, x9 + sbcs x5, x5, x9 + adds x6, x6, x5 + mov x7, #0xffffffff + and x7, x7, x5 + adcs x2, x2, x7 + adcs x3, x3, xzr + mov x7, #0xffffffff00000001 + and x7, x7, x5 + adc x4, x4, x7 + cmp xzr, xzr + extr x9, x4, x3, #32 + adcs xzr, x3, x9 + lsr x9, x4, #32 + adcs x9, x4, x9 + csetm x5, cs + orr x9, x9, x5 + lsl x7, x9, #32 + lsr x8, x9, #32 + adds x3, x3, x7 + adc x4, x4, x8 + negs x5, x9 + sbcs x7, x7, xzr + sbc x8, x8, xzr + negs x5, x5 + sbcs x6, x6, x7 + sbcs x2, x2, x8 + sbcs x3, x3, x9 + sbcs x4, x4, x9 + adds x5, x5, x4 + mov x7, #0xffffffff + and x7, x7, x4 + adcs x6, x6, x7 + adcs x2, x2, xzr + mov x7, #0xffffffff00000001 + and x7, x7, x4 + adc x3, x3, x7 + cmp xzr, xzr + extr x9, x3, x2, #32 + adcs xzr, x2, x9 + lsr x9, x3, #32 + adcs x9, x3, x9 + csetm x4, cs + orr x9, x9, x4 + lsl x7, x9, #32 + lsr x8, x9, #32 + adds x2, x2, x7 + adc x3, x3, x8 + negs x4, x9 + sbcs x7, x7, xzr + sbc x8, x8, xzr + negs x4, x4 + sbcs x5, x5, x7 + sbcs x6, x6, x8 + sbcs x2, x2, x9 + sbcs x3, x3, x9 + adds x4, x4, x3 + mov x7, #0xffffffff + and x7, x7, x3 + adcs x5, x5, x7 + adcs x6, x6, xzr + mov x7, #0xffffffff00000001 + and x7, x7, x3 + adc x2, x2, x7 + cmp xzr, xzr + extr x9, x2, x6, #32 + adcs xzr, x6, x9 + lsr x9, x2, #32 + adcs x9, x2, x9 + csetm x3, cs + orr x9, x9, x3 + lsl x7, x9, #32 + lsr x8, x9, #32 + adds x6, x6, x7 + adc x2, x2, x8 + negs x3, x9 + sbcs x7, x7, xzr + sbc x8, x8, xzr + negs x3, x3 + sbcs x4, x4, x7 + sbcs x5, x5, x8 + sbcs x6, x6, x9 + sbcs x2, x2, x9 + adds x3, x3, x2 + mov x7, #0xffffffff + and x7, x7, x2 + adcs x4, x4, x7 + adcs x5, x5, xzr + mov x7, #0xffffffff00000001 + and x7, x7, x2 + adc x6, x6, x7 + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ret + +p256_scalarmul_alt_local_p256_montjadd: + sub sp, sp, #0xe0 + mov x15, x0 + mov x16, x1 + mov x17, x2 + ldp x2, x3, [x16, #64] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x16, #80] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + 
adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mov x3, #0xffffffff00000001 + mul x2, x8, x3 + umulh x8, x8, x3 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mov x3, #0xffffffff00000001 + mul x2, x9, x3 + umulh x9, x9, x3 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mov x3, #0xffffffff00000001 + mul x2, x10, x3 + umulh x10, x10, x3 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mov x3, #0xffffffff00000001 + mul x2, x11, x3 + umulh x11, x11, x3 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + mov x2, #0xffffffffffffffff + csel x2, xzr, x2, cc + mov x3, #0xffffffff + csel x3, xzr, x3, cc + mov x5, #0xffffffff00000001 + csel x5, xzr, x5, cc + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, xzr + sbc x11, x11, x5 + stp x8, x9, [sp] + stp x10, x11, [sp, #16] + ldp x2, x3, [x17, #64] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x17, #80] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mov x3, #0xffffffff00000001 + mul x2, x8, x3 + umulh x8, x8, x3 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mov x3, #0xffffffff00000001 + mul x2, x9, x3 + umulh x9, x9, x3 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mov x3, #0xffffffff00000001 + mul x2, x10, x3 + umulh x10, x10, x3 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mov x3, #0xffffffff00000001 + mul x2, x11, x3 + umulh x11, x11, x3 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + mov x2, #0xffffffffffffffff + csel x2, xzr, x2, cc + mov x3, #0xffffffff + csel x3, xzr, x3, cc + mov x5, #0xffffffff00000001 + csel x5, xzr, x5, cc + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, xzr + sbc x11, x11, x5 + stp x8, x9, [sp, #160] + stp x10, x11, [sp, #176] + ldp x3, x4, [x17, #64] + ldp x7, x8, [x16, #32] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x16, #48] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, 
x1, xzr + ldp x5, x6, [x17, #80] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #192] + stp x14, x0, [sp, #208] + ldp x3, x4, [x16, #64] + ldp x7, x8, [x17, #32] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x17, #48] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [x16, #80] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, 
x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #32] + stp x14, x0, [sp, #48] + ldp x3, x4, [sp] + ldp x7, x8, [x17] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x17, #16] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #64] + stp x14, x0, [sp, #80] + ldp x3, x4, [sp, #160] + ldp x7, x8, [x16] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x16, #16] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #176] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, 
xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #128] + stp x14, x0, [sp, #144] + ldp x3, x4, [sp] + ldp x7, x8, [sp, #32] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #48] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc 
x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #32] + stp x14, x0, [sp, #48] + ldp x3, x4, [sp, #160] + ldp x7, x8, [sp, #192] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #208] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #176] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #192] + stp x14, x0, [sp, #208] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #160] + stp x7, x8, [sp, #176] + ldp x5, x6, [sp, #32] + ldp x4, x3, [sp, #192] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #48] + ldp x4, x3, [sp, #208] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, 
x8, x4 + stp x5, x6, [sp, #32] + stp x7, x8, [sp, #48] + ldp x2, x3, [sp, #160] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #176] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mov x3, #0xffffffff00000001 + mul x2, x8, x3 + umulh x8, x8, x3 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mov x3, #0xffffffff00000001 + mul x2, x9, x3 + umulh x9, x9, x3 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mov x3, #0xffffffff00000001 + mul x2, x10, x3 + umulh x10, x10, x3 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mov x3, #0xffffffff00000001 + mul x2, x11, x3 + umulh x11, x11, x3 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + mov x2, #0xffffffffffffffff + csel x2, xzr, x2, cc + mov x3, #0xffffffff + csel x3, xzr, x3, cc + mov x5, #0xffffffff00000001 + csel x5, xzr, x5, cc + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, xzr + sbc x11, x11, x5 + stp x8, x9, [sp, #96] + stp x10, x11, [sp, #112] + ldp x2, x3, [sp, #32] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #48] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mov x3, #0xffffffff00000001 + mul x2, x8, x3 + umulh x8, x8, x3 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mov x3, #0xffffffff00000001 + mul x2, x9, x3 + umulh x9, x9, x3 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mov x3, #0xffffffff00000001 + mul x2, x10, x3 + umulh x10, x10, x3 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mov x3, #0xffffffff00000001 + mul x2, x11, x3 + umulh x11, x11, x3 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs 
x10, x10, x14 + adcs x11, x11, x7 + cset x2, cs + mov x3, #0xffffffff + mov x5, #0xffffffff00000001 + adds x12, x8, #0x1 + sbcs x13, x9, x3 + sbcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, cc + csel x9, x9, x13, cc + csel x10, x10, x14, cc + csel x11, x11, x7, cc + stp x8, x9, [sp] + stp x10, x11, [sp, #16] + ldp x3, x4, [sp, #96] + ldp x7, x8, [sp, #128] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #144] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #112] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #128] + stp x14, x0, [sp, #144] + ldp x3, x4, [sp, #96] + ldp x7, x8, [sp, #64] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #80] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #112] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs 
x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #64] + stp x14, x0, [sp, #80] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #16] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #16] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #96] + stp x7, x8, [sp, #112] + ldp x3, x4, [sp, #160] + ldp x7, x8, [x16, #64] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x16, #80] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #176] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + 
umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #160] + stp x14, x0, [sp, #176] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #64] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #16] + ldp x4, x3, [sp, #80] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #16] + ldp x5, x6, [sp, #128] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #144] + ldp x4, x3, [sp, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, #144] + ldp x3, x4, [sp, #96] + ldp x7, x8, [sp, #192] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #208] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #112] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 
+ umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #96] + stp x14, x0, [sp, #112] + ldp x3, x4, [sp, #160] + ldp x7, x8, [x17, #64] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x17, #80] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #176] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #160] + stp x14, x0, [sp, #176] + ldp x3, x4, [sp, #32] + ldp x7, x8, [sp, #128] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #144] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #48] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + 
adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #128] + stp x14, x0, [sp, #144] + ldp x5, x6, [sp, #128] + ldp x4, x3, [sp, #96] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #144] + ldp x4, x3, [sp, #112] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, #144] + ldp x0, x1, [x16, #64] + ldp x2, x3, [x16, #80] + orr x12, x0, x1 + orr x13, x2, x3 + orr x12, x12, x13 + cmp x12, xzr + cset x12, ne + ldp x4, x5, [x17, #64] + ldp x6, x7, [x17, #80] + orr x13, x4, x5 + orr x14, x6, x7 + orr x13, x13, x14 + cmp x13, xzr + cset x13, ne + cmp x13, x12 + ldp x8, x9, [sp, #160] + csel x8, x0, x8, cc + csel x9, x1, x9, cc + csel x8, x4, x8, hi + csel x9, x5, x9, hi + ldp x10, x11, [sp, #176] + csel x10, x2, x10, cc + csel x11, x3, x11, cc + csel x10, x6, x10, hi + csel x11, x7, x11, hi + ldp x12, x13, [x16] + ldp x0, x1, [sp] + csel x0, x12, x0, cc + csel x1, x13, x1, cc + ldp x12, x13, [x17] + csel x0, x12, x0, hi + csel x1, x13, x1, hi + ldp x12, x13, [x16, #16] + ldp x2, x3, [sp, #16] + csel x2, x12, x2, cc + csel x3, x13, x3, cc + ldp x12, x13, [x17, #16] + csel x2, x12, x2, hi + csel x3, x13, x3, hi + ldp x12, x13, [x16, #32] + ldp x4, x5, [sp, #128] + csel x4, x12, x4, cc + csel x5, x13, x5, cc + ldp x12, x13, [x17, #32] + csel x4, x12, x4, hi + csel x5, x13, x5, hi + ldp x12, x13, [x16, #48] + ldp x6, x7, [sp, #144] + csel x6, x12, x6, cc + csel x7, x13, x7, cc + ldp x12, x13, [x17, #48] + csel x6, x12, x6, hi + csel x7, x13, x7, hi + stp x0, x1, [x15] + stp x2, x3, [x15, #16] + stp x4, x5, [x15, #32] + stp x6, x7, [x15, #48] + stp x8, x9, [x15, #64] + stp x10, x11, [x15, #80] + add sp, sp, #0xe0 + ret + +p256_scalarmul_alt_local_p256_montjdouble: + sub sp, sp, #0xc0 + mov x15, x0 + mov x16, x1 + ldp x2, x3, [x16, #64] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x16, #80] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, 
x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + mov x5, #0xffffffff00000001 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mul x2, x8, x5 + umulh x8, x8, x5 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mul x2, x9, x5 + umulh x9, x9, x5 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mul x2, x10, x5 + umulh x10, x10, x5 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mul x2, x11, x5 + umulh x11, x11, x5 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, cs + mov x3, #0xffffffff + adds x12, x8, #0x1 + sbcs x13, x9, x3 + sbcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, cc + csel x9, x9, x13, cc + csel x10, x10, x14, cc + csel x11, x11, x7, cc + stp x8, x9, [sp] + stp x10, x11, [sp, #16] + ldp x2, x3, [x16, #32] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x16, #48] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + mov x5, #0xffffffff00000001 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mul x2, x8, x5 + umulh x8, x8, x5 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mul x2, x9, x5 + umulh x9, x9, x5 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mul x2, x10, x5 + umulh x10, x10, x5 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mul x2, x11, x5 + umulh x11, x11, x5 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, cs + mov x3, #0xffffffff + adds x12, x8, #0x1 + sbcs x13, x9, x3 + sbcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, cc + csel x9, x9, x13, cc + csel x10, x10, x14, cc + csel x11, x11, x7, cc + stp x8, x9, [sp, #32] + stp x10, x11, [sp, #48] + ldp x5, x6, [x16] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x16, #16] + ldp x4, x3, [sp, #16] + sbcs x7, x7, x4 + sbcs x8, 
x8, x3 + csetm x3, cc + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [sp, #96] + stp x7, x8, [sp, #112] + ldp x5, x6, [x16] + ldp x4, x3, [sp] + adds x5, x5, x4 + adcs x6, x6, x3 + ldp x7, x8, [x16, #16] + ldp x4, x3, [sp, #16] + adcs x7, x7, x4 + adcs x8, x8, x3 + csetm x3, cs + subs x5, x5, x3 + and x1, x3, #0xffffffff + sbcs x6, x6, x1 + sbcs x7, x7, xzr + and x2, x3, #0xffffffff00000001 + sbc x8, x8, x2 + stp x5, x6, [sp, #64] + stp x7, x8, [sp, #80] + ldp x3, x4, [sp, #64] + ldp x7, x8, [sp, #96] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #112] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #80] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #96] + stp x14, x0, [sp, #112] + ldp x5, x6, [x16, #32] + ldp x4, x3, [x16, #64] + adds x5, x5, x4 + adcs x6, x6, x3 + ldp x7, x8, [x16, #48] + ldp x4, x3, [x16, #80] + adcs x7, x7, x4 + adcs x8, x8, x3 + adc x3, xzr, xzr + cmn x5, #0x1 + mov x4, #0xffffffff + sbcs xzr, x6, x4 + sbcs xzr, x7, xzr + mov x4, #0xffffffff00000001 + sbcs xzr, x8, x4 + adcs x3, x3, xzr + csetm x3, ne + subs x5, x5, x3 + and x4, x3, #0xffffffff + sbcs x6, x6, x4 + sbcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + sbc x8, x8, x4 + stp x5, x6, [sp, #64] + stp x7, x8, [sp, #80] + ldp x3, x4, [x16] + ldp x7, x8, [sp, #32] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #48] + 
mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [x16, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #128] + stp x14, x0, [sp, #144] + ldp x2, x3, [sp, #96] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #112] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + mov x5, #0xffffffff00000001 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mul x2, x8, x5 + umulh x8, x8, x5 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mul x2, x9, x5 + umulh x9, x9, x5 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mul x2, x10, x5 + umulh x10, x10, x5 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mul x2, x11, x5 + umulh x11, x11, x5 + adcs 
x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, cs + mov x3, #0xffffffff + adds x12, x8, #0x1 + sbcs x13, x9, x3 + sbcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, cc + csel x9, x9, x13, cc + csel x10, x10, x14, cc + csel x11, x11, x7, cc + stp x8, x9, [sp, #160] + stp x10, x11, [sp, #176] + ldp x2, x3, [sp, #64] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #80] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + mov x5, #0xffffffff00000001 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mul x2, x8, x5 + umulh x8, x8, x5 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mul x2, x9, x5 + umulh x9, x9, x5 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mul x2, x10, x5 + umulh x10, x10, x5 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mul x2, x11, x5 + umulh x11, x11, x5 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, cs + mov x3, #0xffffffff + adds x12, x8, #0x1 + sbcs x13, x9, x3 + sbcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, cc + csel x9, x9, x13, cc + csel x10, x10, x14, cc + csel x11, x11, x7, cc + stp x8, x9, [sp, #64] + stp x10, x11, [sp, #80] + mov x1, #0x9 + mov x2, #0xffffffffffffffff + ldp x9, x10, [sp, #160] + subs x9, x2, x9 + mov x2, #0xffffffff + sbcs x10, x2, x10 + ldp x11, x12, [sp, #176] + ngcs x11, x11 + mov x2, #0xffffffff00000001 + sbc x12, x2, x12 + mul x3, x1, x9 + mul x4, x1, x10 + mul x5, x1, x11 + mul x6, x1, x12 + umulh x9, x1, x9 + umulh x10, x1, x10 + umulh x11, x1, x11 + umulh x7, x1, x12 + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, xzr + mov x1, #0xc + ldp x9, x10, [sp, #128] + mul x8, x9, x1 + umulh x9, x9, x1 + adds x3, x3, x8 + mul x8, x10, x1 + umulh x10, x10, x1 + adcs x4, x4, x8 + ldp x11, x12, [sp, #144] + mul x8, x11, x1 + umulh x11, x11, x1 + adcs x5, x5, x8 + mul x8, x12, x1 + umulh x12, x12, x1 + adcs x6, x6, x8 + adc x7, x7, xzr + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, x12 + add x8, x7, #0x1 + lsl x10, x8, #32 + adds x6, x6, x10 + adc x7, x7, xzr + neg x9, x8 + sub x10, x10, #0x1 + subs x3, x3, x9 + sbcs x4, x4, x10 + sbcs x5, x5, xzr + sbcs x6, x6, x8 + sbc x8, x7, x8 + adds x3, x3, x8 + and x9, x8, #0xffffffff + adcs x4, x4, x9 + adcs x5, x5, xzr + neg x10, x9 + adc x6, x6, x10 + stp x3, x4, [sp, #160] + stp x5, x6, [sp, #176] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #16] + sbcs x7, x7, x4 
+ sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [sp, #64] + stp x7, x8, [sp, #80] + ldp x2, x3, [sp, #32] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #48] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + mov x5, #0xffffffff00000001 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mul x2, x8, x5 + umulh x8, x8, x5 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mul x2, x9, x5 + umulh x9, x9, x5 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mul x2, x10, x5 + umulh x10, x10, x5 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mul x2, x11, x5 + umulh x11, x11, x5 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, cs + mov x3, #0xffffffff + adds x12, x8, #0x1 + sbcs x13, x9, x3 + sbcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, cc + csel x9, x9, x13, cc + csel x10, x10, x14, cc + csel x11, x11, x7, cc + stp x8, x9, [sp] + stp x10, x11, [sp, #16] + ldp x3, x4, [sp, #160] + ldp x7, x8, [sp, #96] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #112] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #176] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, 
x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #96] + stp x14, x0, [sp, #112] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #32] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #48] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [x15, #64] + stp x7, x8, [x15, #80] + ldp x1, x2, [sp, #128] + lsl x0, x1, #2 + ldp x6, x7, [sp, #160] + subs x0, x0, x6 + extr x1, x2, x1, #62 + sbcs x1, x1, x7 + ldp x3, x4, [sp, #144] + extr x2, x3, x2, #62 + ldp x6, x7, [sp, #176] + sbcs x2, x2, x6 + extr x3, x4, x3, #62 + sbcs x3, x3, x7 + lsr x4, x4, #62 + sbc x4, x4, xzr + add x5, x4, #0x1 + lsl x8, x5, #32 + negs x6, x8 + ngcs x7, xzr + sbc x8, x8, x5 + adds x0, x0, x5 + adcs x1, x1, x6 + adcs x2, x2, x7 + adcs x3, x3, x8 + csetm x5, cc + adds x0, x0, x5 + and x6, x5, #0xffffffff + adcs x1, x1, x6 + adcs x2, x2, xzr + neg x7, x6 + adc x3, x3, x7 + stp x0, x1, [x15] + stp x2, x3, [x15, #16] + mov x1, #0x8 + mov x2, #0xffffffffffffffff + ldp x9, x10, [sp] + subs x9, x2, x9 + mov x2, #0xffffffff + sbcs x10, x2, x10 + ldp x11, x12, [sp, #16] + ngcs x11, x11 + mov x2, #0xffffffff00000001 + sbc x12, x2, x12 + lsl x3, x9, #3 + extr x4, x10, x9, #61 + extr x5, x11, x10, #61 + extr x6, x12, x11, #61 + lsr x7, x12, #61 + mov x1, #0x3 + ldp x9, x10, [sp, #96] + mul x8, x9, x1 + umulh x9, x9, x1 + adds x3, x3, x8 + mul x8, x10, x1 + umulh x10, x10, x1 + adcs x4, x4, x8 + ldp x11, x12, [sp, #112] + mul x8, x11, x1 + umulh x11, x11, x1 + adcs x5, x5, x8 + mul x8, x12, x1 + umulh x12, x12, x1 + adcs x6, x6, x8 + adc x7, x7, xzr + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, x12 + add x8, x7, #0x1 + lsl x10, x8, #32 + adds x6, x6, x10 + adc x7, x7, xzr + neg x9, x8 + sub x10, x10, #0x1 + subs x3, x3, x9 + sbcs x4, x4, x10 + sbcs x5, x5, xzr + sbcs x6, x6, x8 + sbc x8, x7, x8 + adds x3, x3, x8 + and x9, x8, #0xffffffff + adcs x4, x4, x9 + adcs x5, x5, xzr + neg x10, x9 + adc x6, x6, x10 + stp x3, x4, [x15, #32] + stp x5, x6, [x15, #48] + add sp, sp, #0xc0 + ret + +p256_scalarmul_alt_local_p256_montjmixadd: + sub sp, sp, #0xc0 + mov x15, x0 + mov x16, x1 + mov x17, x2 + ldp x2, x3, [x16, #64] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x16, #80] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh 
x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mov x3, #0xffffffff00000001 + mul x2, x8, x3 + umulh x8, x8, x3 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mov x3, #0xffffffff00000001 + mul x2, x9, x3 + umulh x9, x9, x3 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mov x3, #0xffffffff00000001 + mul x2, x10, x3 + umulh x10, x10, x3 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mov x3, #0xffffffff00000001 + mul x2, x11, x3 + umulh x11, x11, x3 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + mov x2, #0xffffffffffffffff + csel x2, xzr, x2, cc + mov x3, #0xffffffff + csel x3, xzr, x3, cc + mov x5, #0xffffffff00000001 + csel x5, xzr, x5, cc + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, xzr + sbc x11, x11, x5 + stp x8, x9, [sp] + stp x10, x11, [sp, #16] + ldp x3, x4, [x16, #64] + ldp x7, x8, [x17, #32] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x17, #48] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [x16, #80] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, 
cc + stp x12, x13, [sp, #32] + stp x14, x0, [sp, #48] + ldp x3, x4, [sp] + ldp x7, x8, [x17] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x17, #16] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #64] + stp x14, x0, [sp, #80] + ldp x3, x4, [sp] + ldp x7, x8, [sp, #32] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #48] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, 
x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #32] + stp x14, x0, [sp, #48] + ldp x5, x6, [sp, #64] + ldp x4, x3, [x16] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [x16, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #160] + stp x7, x8, [sp, #176] + ldp x5, x6, [sp, #32] + ldp x4, x3, [x16, #32] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #48] + ldp x4, x3, [x16, #48] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #32] + stp x7, x8, [sp, #48] + ldp x2, x3, [sp, #160] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #176] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mov x3, #0xffffffff00000001 + mul x2, x8, x3 + umulh x8, x8, x3 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mov x3, #0xffffffff00000001 + mul x2, x9, x3 + umulh x9, x9, x3 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mov x3, #0xffffffff00000001 + mul x2, x10, x3 + umulh x10, x10, x3 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mov x3, #0xffffffff00000001 + mul x2, x11, x3 + umulh x11, x11, x3 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + mov x2, #0xffffffffffffffff + csel x2, xzr, x2, cc + mov x3, 
#0xffffffff + csel x3, xzr, x3, cc + mov x5, #0xffffffff00000001 + csel x5, xzr, x5, cc + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, xzr + sbc x11, x11, x5 + stp x8, x9, [sp, #96] + stp x10, x11, [sp, #112] + ldp x2, x3, [sp, #32] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #48] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mov x3, #0xffffffff00000001 + mul x2, x8, x3 + umulh x8, x8, x3 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mov x3, #0xffffffff00000001 + mul x2, x9, x3 + umulh x9, x9, x3 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mov x3, #0xffffffff00000001 + mul x2, x10, x3 + umulh x10, x10, x3 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mov x3, #0xffffffff00000001 + mul x2, x11, x3 + umulh x11, x11, x3 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, cs + mov x3, #0xffffffff + mov x5, #0xffffffff00000001 + adds x12, x8, #0x1 + sbcs x13, x9, x3 + sbcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, cc + csel x9, x9, x13, cc + csel x10, x10, x14, cc + csel x11, x11, x7, cc + stp x8, x9, [sp] + stp x10, x11, [sp, #16] + ldp x3, x4, [sp, #96] + ldp x7, x8, [x16] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x16, #16] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #112] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 
+ adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #128] + stp x14, x0, [sp, #144] + ldp x3, x4, [sp, #96] + ldp x7, x8, [sp, #64] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #80] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #112] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #64] + stp x14, x0, [sp, #80] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #16] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, 
x8, [sp, #16] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #96] + stp x7, x8, [sp, #112] + ldp x3, x4, [sp, #160] + ldp x7, x8, [x16, #64] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x16, #80] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #176] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #160] + stp x14, x0, [sp, #176] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #64] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #16] + ldp x4, x3, [sp, #80] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #16] + ldp x5, x6, [sp, #128] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #144] + ldp x4, x3, [sp, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, #144] + ldp x3, x4, [sp, #96] + ldp x7, x8, [x16, #32] + mul x12, x3, x7 + umulh x13, 
x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x16, #48] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #112] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #96] + stp x14, x0, [sp, #112] + ldp x3, x4, [sp, #32] + ldp x7, x8, [sp, #128] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #144] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #48] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + 
adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #128] + stp x14, x0, [sp, #144] + ldp x5, x6, [sp, #128] + ldp x4, x3, [sp, #96] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #144] + ldp x4, x3, [sp, #112] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, #144] + ldp x0, x1, [x16, #64] + ldp x2, x3, [x16, #80] + orr x4, x0, x1 + orr x5, x2, x3 + orr x4, x4, x5 + cmp x4, xzr + ldp x0, x1, [sp] + ldp x12, x13, [x17] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x2, x3, [sp, #16] + ldp x12, x13, [x17, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x4, x5, [sp, #128] + ldp x12, x13, [x17, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x6, x7, [sp, #144] + ldp x12, x13, [x17, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x8, x9, [sp, #160] + mov x12, #0x1 + mov x13, #0xffffffff00000000 + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x10, x11, [sp, #176] + mov x12, #0xffffffffffffffff + mov x13, #0xfffffffe + csel x10, x10, x12, ne + csel x11, x11, x13, ne + stp x0, x1, [x15] + stp x2, x3, [x15, #16] + stp x4, x5, [x15, #32] + stp x6, x7, [x15, #48] + stp x8, x9, [x15, #64] + stp x10, x11, [x15, #80] + add sp, sp, #0xc0 + ret + + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_scalarmulbase.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_scalarmulbase.S new file mode 100644 index 00000000000..29b6c9ed892 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_scalarmulbase.S @@ -0,0 +1,3751 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Scalar multiplication for precomputed point on NIST curve P-256 +// Input scalar[4], blocksize, table[]; output res[8] +// +// extern void p256_scalarmulbase +// (uint64_t res[static 8], +// uint64_t scalar[static 4], +// uint64_t blocksize, +// uint64_t *table); +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve P-256, the input argument "table" is expected to be a table of +// multiples of the point P in Montgomery-affine form, with each block +// corresponding to "blocksize" bits of the scalar as follows, where +// B = 2^{blocksize-1} (e.g. 
B = 8 for blocksize = 4):
+//
+// For each i,j with blocksize * i <= 256 and 1 <= j <= B
+// the multiple 2^{blocksize * i} * j * P is stored at
+// tab[8 * (B * i + (j - 1))], considered as uint64_t pointers
+// or tab + 64 * (B * i + (j - 1)) as byte pointers.
+//
+// Standard ARM ABI: X0 = res, X1 = scalar, X2 = blocksize, X3 = table
+// ----------------------------------------------------------------------------
+
+#include "_internal_s2n_bignum.h"
+
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_scalarmulbase)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_scalarmulbase)
+
+        .text
+        .balign 4
+
+// Size of individual field elements
+
+#define NUMSIZE 32
+
+// Safe copies of inputs and additional variables, with some aliasing
+
+#define res x19
+#define blocksize x20
+#define table x21
+#define i x22
+#define bf x23
+#define cf x24
+#define j x25
+
+// Intermediate variables on the stack. The last z2, z3 values can
+// safely be overlaid on "nacc", which is no longer needed at the end.
+// Uppercase syntactic variants make x86_att version simpler to generate
+
+#define rscalar sp, #(0*NUMSIZE)
+#define acc sp, #(1*NUMSIZE)
+#define nacc sp, #(4*NUMSIZE)
+#define tabent sp, #(7*NUMSIZE)
+
+#define z2 sp, #(4*NUMSIZE)
+#define z3 sp, #(5*NUMSIZE)
+
+#define NSPACE #(9*NUMSIZE)
+
+// Loading large constants
+
+#define movbig(nn,n3,n2,n1,n0) \
+        movz nn, n0 __LF \
+        movk nn, n1, lsl #16 __LF \
+        movk nn, n2, lsl #32 __LF \
+        movk nn, n3, lsl #48
+
+S2N_BN_SYMBOL(p256_scalarmulbase):
+
+        stp x19, x20, [sp, #-16]!
+        stp x21, x22, [sp, #-16]!
+        stp x23, x24, [sp, #-16]!
+        stp x25, x30, [sp, #-16]!
+        sub sp, sp, NSPACE
+
+// Preserve the input arguments except the scalar, since that gets absorbed
+// immediately. The "table" value subsequently gets shifted up each iteration
+// of the loop, while "res" and "blocksize" are static throughout.
+
+        mov res, x0
+        mov blocksize, x2
+        mov table, x3
+
+// Load the digits of group order n_256 = [x15;x14;x13;x12]
+
+        movbig(x12, #0xf3b9, #0xcac2, #0xfc63, #0x2551)
+        movbig(x13, #0xbce6, #0xfaad, #0xa717, #0x9e84)
+        mov x14, #0xffffffffffffffff
+        mov x15, #0xffffffff00000000
+
+// First, reduce the input scalar mod n_256, i.e. conditionally subtract n_256
+// Store it to "rscalar" (reduced scalar)
+
+        ldp x2, x3, [x1]
+        ldp x4, x5, [x1, #16]
+
+        subs x6, x2, x12
+        sbcs x7, x3, x13
+        sbcs x8, x4, x14
+        sbcs x9, x5, x15
+
+        csel x2, x2, x6, cc
+        csel x3, x3, x7, cc
+        csel x4, x4, x8, cc
+        csel x5, x5, x9, cc
+
+        stp x2, x3, [rscalar]
+        stp x4, x5, [rscalar+16]
+
+// Initialize the accumulator to all zeros and the "carry flag" cf to 0
+
+        stp xzr, xzr, [acc]
+        stp xzr, xzr, [acc+16]
+        stp xzr, xzr, [acc+32]
+        stp xzr, xzr, [acc+48]
+        stp xzr, xzr, [acc+64]
+        stp xzr, xzr, [acc+80]
+        mov cf, xzr
+
+// Main loop over {i >= 0 | blocksize * i <= 256}. Note the non-strict
+// inequality, to allow top carry for any choices of blocksize.
+
+        mov i, xzr
+
+p256_scalarmulbase_loop:
+
+// The next raw bitfield is bf = bitfield(blocksize * i,blocksize) + cf,
+// adding in the deferred carry cf. We then shift the whole scalar right
+// by blocksize so we can keep picking bitfield(0,blocksize).
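The comments above describe the table layout and the signed-window recoding that this main loop performs. As a reading aid, here is a minimal C sketch of that bookkeeping, assuming hypothetical helper names `table_byte_offset` and `recode_window` (neither exists in s2n-bignum); only the arithmetic follows the comments in the assembly.

```c
#include <stdint.h>
#include <stddef.h>

/* Byte offset of the entry 2^{blocksize*i} * j * P (for 1 <= j <= B) in the
   precomputed table, per the layout comment above: each affine point is
   8 uint64_t words = 64 bytes. (Hypothetical helper, illustration only.) */
static size_t table_byte_offset(uint64_t blocksize, uint64_t i, uint64_t j) {
  uint64_t B = UINT64_C(1) << (blocksize - 1);   /* B = 2^{blocksize-1} */
  return (size_t)(64 * (B * i + (j - 1)));
}

/* Signed recoding of one blocksize-bit window. Returns j in [0, B], where
   j == 0 means "add nothing"; *negate reports whether the selected point's
   y coordinate must be negated, and *cf is the carry into the next window. */
static uint64_t recode_window(uint64_t window, uint64_t blocksize,
                              uint64_t *cf, int *negate) {
  uint64_t B = UINT64_C(1) << (blocksize - 1);
  uint64_t bf = window + *cf;      /* raw bitfield plus deferred carry */
  if (bf <= B) {                   /* add +bf * P, no carry */
    *cf = 0;
    *negate = 0;
    return bf;
  }
  *cf = 1;                         /* adding -(2B - bf) * P leaves 2B over, */
  *negate = 1;                     /* which carries as 1 into the next window */
  return 2 * B - bf;
}
```

Keeping the selected digit in 0..B rather than 0..2B-1 is what lets the table hold only B points per block: the other half of the digit range is reached by negating the stored point, which for an affine P-256 point only costs subtracting y from the field modulus, as the negation block later in the loop does.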
+
+        ldp x0, x1, [rscalar]
+        ldp x2, x3, [rscalar+16]
+
+        mov x4, #1
+        lsl x4, x4, blocksize
+        sub x4, x4, #1
+        and x4, x4, x0
+        add bf, x4, cf
+
+        neg x8, blocksize
+
+        lsl x5, x1, x8
+
+        lsr x0, x0, blocksize
+        orr x0, x0, x5
+
+        lsl x6, x2, x8
+        lsr x1, x1, blocksize
+        orr x1, x1, x6
+
+        lsl x7, x3, x8
+        lsr x2, x2, blocksize
+        orr x2, x2, x7
+
+        lsr x3, x3, blocksize
+
+        stp x0, x1, [rscalar]
+        stp x2, x3, [rscalar+16]
+
+// Now if bf <= B we just select entry j, unnegated and set cf = 0.
+// If bf > B we set j = 2 * B - bf and negate the j'th entry, setting cf = 1.
+// In either case we ultimately add bf, in the latter case with deferred
+// carry as 2 * B - (2 * B - bf) = bf.
+
+        mov x0, #1
+        lsl x1, x0, blocksize
+        lsr x0, x1, #1
+
+        sub x2, x1, bf
+
+        cmp x0, bf
+        cset cf, cc
+        csel j, x2, bf, cc
+
+// Load table entry j - 1 for nonzero j in constant-time style.
+
+        mov x16, #1
+        lsl x16, x16, blocksize
+        lsr x16, x16, #1
+        mov x17, j
+
+p256_scalarmulbase_tabloop:
+        ldp x8, x9, [table]
+        ldp x10, x11, [table, #16]
+        ldp x12, x13, [table, #32]
+        ldp x14, x15, [table, #48]
+
+        subs x17, x17, #1
+        csel x0, x8, x0, eq
+        csel x1, x9, x1, eq
+        csel x2, x10, x2, eq
+        csel x3, x11, x3, eq
+        csel x4, x12, x4, eq
+        csel x5, x13, x5, eq
+        csel x6, x14, x6, eq
+        csel x7, x15, x7, eq
+
+        add table, table, #64
+
+        sub x16, x16, #1
+        cbnz x16, p256_scalarmulbase_tabloop
+
+// Before storing back, optionally negate the y coordinate of the table entry
+
+        stp x0, x1, [tabent]
+        stp x2, x3, [tabent+16]
+
+        mov x0, 0xffffffffffffffff
+        subs x0, x0, x4
+        mov x1, 0x00000000ffffffff
+        sbcs x1, x1, x5
+        mov x3, 0xffffffff00000001
+        sbcs x2, xzr, x6
+        sbc x3, x3, x7
+
+        cmp cf, xzr
+        csel x4, x0, x4, ne
+        csel x5, x1, x5, ne
+        csel x6, x2, x6, ne
+        csel x7, x3, x7, ne
+
+        stp x4, x5, [tabent+32]
+        stp x6, x7, [tabent+48]
+
+// Add the adjusted table point to the accumulator
+
+        add x0, nacc
+        add x1, acc
+        add x2, tabent
+        bl p256_scalarmulbase_local_p256_montjmixadd
+
+// However, only commit that update to the accumulator if j is nonzero,
+// because the mixed addition function does not handle this case directly,
+// and in any case we didn't choose the table entry appropriately.
+
+        cmp j, xzr
+        ldp x0, x1, [acc]
+        ldp x12, x13, [nacc]
+        csel x0, x12, x0, ne
+        csel x1, x13, x1, ne
+
+        ldp x2, x3, [acc+16]
+        ldp x12, x13, [nacc+16]
+        csel x2, x12, x2, ne
+        csel x3, x13, x3, ne
+
+        ldp x4, x5, [acc+32]
+        ldp x12, x13, [nacc+32]
+        csel x4, x12, x4, ne
+        csel x5, x13, x5, ne
+
+        ldp x6, x7, [acc+48]
+        ldp x12, x13, [nacc+48]
+        csel x6, x12, x6, ne
+        csel x7, x13, x7, ne
+
+        ldp x8, x9, [acc+64]
+        ldp x12, x13, [nacc+64]
+        csel x8, x12, x8, ne
+        csel x9, x13, x9, ne
+
+        ldp x10, x11, [acc+80]
+        ldp x12, x13, [nacc+80]
+        csel x10, x12, x10, ne
+        csel x11, x13, x11, ne
+
+        stp x0, x1, [acc]
+        stp x2, x3, [acc+16]
+        stp x4, x5, [acc+32]
+        stp x6, x7, [acc+48]
+        stp x8, x9, [acc+64]
+        stp x10, x11, [acc+80]
+
+// Loop while blocksize * i <= 256
+
+        add i, i, #1
+        mul x0, blocksize, i
+        cmp x0, #257
+        bcc p256_scalarmulbase_loop
+
+// That's the end of the main loop, and we just need to translate
+// back from the Jacobian representation to affine.
First of all, +// let z2 = 1/z^2 and z3 = 1/z^3, both without Montgomery form + + add x0, z2 + add x1, acc+64 + bl p256_scalarmulbase_local_montsqr_p256 + + add x0, z3 + add x1, acc+64 + add x2, z2 + bl p256_scalarmulbase_local_montmul_p256 + + add x0, z2 + add x1, z3 + bl p256_scalarmulbase_local_demont_p256 + + add x0, z3 + add x1, z2 + bl p256_scalarmulbase_local_inv_p256 + + add x0, z2 + add x1, acc+64 + add x2, z3 + bl p256_scalarmulbase_local_montmul_p256 + +// Convert back from Jacobian (X,Y,Z) |-> (X/Z^2, Y/Z^3) + + mov x0, res + add x1, acc + add x2, z2 + bl p256_scalarmulbase_local_montmul_p256 + + add x0, res, #32 + add x1, acc+32 + add x2, z3 + bl p256_scalarmulbase_local_montmul_p256 + +// Restore stack and registers and return + + add sp, sp, NSPACE + ldp x25, x30, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +// Local copies of subroutines, complete clones at the moment + +p256_scalarmulbase_local_demont_p256: + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + lsl x7, x2, #32 + subs x8, x2, x7 + lsr x6, x2, #32 + sbc x2, x2, x6 + adds x3, x3, x7 + adcs x4, x4, x6 + adcs x5, x5, x8 + adc x2, x2, xzr + lsl x7, x3, #32 + subs x8, x3, x7 + lsr x6, x3, #32 + sbc x3, x3, x6 + adds x4, x4, x7 + adcs x5, x5, x6 + adcs x2, x2, x8 + adc x3, x3, xzr + lsl x7, x4, #32 + subs x8, x4, x7 + lsr x6, x4, #32 + sbc x4, x4, x6 + adds x5, x5, x7 + adcs x2, x2, x6 + adcs x3, x3, x8 + adc x4, x4, xzr + lsl x7, x5, #32 + subs x8, x5, x7 + lsr x6, x5, #32 + sbc x5, x5, x6 + adds x2, x2, x7 + adcs x3, x3, x6 + adcs x4, x4, x8 + adc x5, x5, xzr + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + ret + +p256_scalarmulbase_local_inv_p256: + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + sub sp, sp, #0xa0 + mov x20, x0 + mov x10, #0xffffffffffffffff + mov x11, #0xffffffff + mov x13, #0xffffffff00000001 + stp x10, x11, [sp] + stp xzr, x13, [sp, #16] + str xzr, [sp, #32] + ldp x2, x3, [x1] + subs x10, x2, x10 + sbcs x11, x3, x11 + ldp x4, x5, [x1, #16] + sbcs x12, x4, xzr + sbcs x13, x5, x13 + csel x2, x2, x10, cc + csel x3, x3, x11, cc + csel x4, x4, x12, cc + csel x5, x5, x13, cc + stp x2, x3, [sp, #48] + stp x4, x5, [sp, #64] + str xzr, [sp, #80] + stp xzr, xzr, [sp, #96] + stp xzr, xzr, [sp, #112] + mov x10, #0x4000000000000 + stp x10, xzr, [sp, #128] + stp xzr, xzr, [sp, #144] + mov x21, #0xa + mov x22, #0x1 + b p256_scalarmulbase_inv_midloop +p256_scalarmulbase_inv_loop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, 
x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #48] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #64] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #56] + ldr x7, [sp, #24] + eor x1, x7, x14 + ldr x23, [sp, #32] + eor x3, x23, x14 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #72] + eor x1, x8, x15 + ldr x24, [sp, #80] + eor x0, x24, x15 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + asr x3, x3, #59 + str x3, [sp, #32] + eor x1, x7, x16 + eor x5, x23, x16 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + eor x0, x24, x17 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #64] + extr x2, x5, x2, #59 + str x2, [sp, #72] + asr x5, x5, #59 + str x5, [sp, #80] + ldr x7, [sp, #96] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #128] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #96] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #128] + adc x3, x3, x1 + ldr x7, [sp, #104] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #136] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #104] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #136] + adc x4, x4, x1 + ldr x7, [sp, #112] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #144] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #112] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #144] + adc x2, x2, x1 + ldr x7, [sp, #120] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #152] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + ldp x0, x1, [sp, #96] + ldr x6, [sp, #112] + mov x14, #0xe000000000000000 + adds x0, x0, x14 + sbcs x1, x1, xzr + mov x11, #0x1fffffff + adcs x6, x6, x11 + mov x10, #0x2000000000000000 + adcs x5, x5, x10 + mov x14, #0x1fffffffe0000000 + adc x3, x3, x14 + lsl x11, x0, #32 + subs x14, x0, x11 + lsr x10, x0, #32 + sbc x0, x0, x10 + adds x1, x1, x11 + adcs x6, x6, x10 + adcs x5, x5, x14 + adcs 
x3, x3, x0 + mov x14, #0xffffffffffffffff + mov x11, #0xffffffff + mov x10, #0xffffffff00000001 + csel x14, x14, xzr, cs + csel x11, x11, xzr, cs + csel x10, x10, xzr, cs + subs x1, x1, x14 + sbcs x6, x6, x11 + sbcs x5, x5, xzr + sbc x3, x3, x10 + stp x1, x6, [sp, #96] + stp x5, x3, [sp, #112] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + ldp x0, x1, [sp, #128] + ldr x3, [sp, #144] + mov x14, #0xe000000000000000 + adds x0, x0, x14 + sbcs x1, x1, xzr + mov x11, #0x1fffffff + adcs x3, x3, x11 + mov x10, #0x2000000000000000 + adcs x2, x2, x10 + mov x14, #0x1fffffffe0000000 + adc x5, x5, x14 + lsl x11, x0, #32 + subs x14, x0, x11 + lsr x10, x0, #32 + sbc x0, x0, x10 + adds x1, x1, x11 + adcs x3, x3, x10 + adcs x2, x2, x14 + adcs x5, x5, x0 + mov x14, #0xffffffffffffffff + mov x11, #0xffffffff + mov x10, #0xffffffff00000001 + csel x14, x14, xzr, cs + csel x11, x11, xzr, cs + csel x10, x10, xzr, cs + subs x1, x1, x14 + sbcs x3, x3, x11 + sbcs x2, x2, xzr + sbc x5, x5, x10 + stp x1, x3, [sp, #128] + stp x2, x5, [sp, #144] +p256_scalarmulbase_inv_midloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #48] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, 
xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, 
x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, 
ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + bne p256_scalarmulbase_inv_loop + ldr x0, [sp] + ldr x1, [sp, #48] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #96] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #128] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #96] + adc x2, x2, x1 + ldr x7, [sp, #104] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #136] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + 
adds x2, x2, x0 + str x2, [sp, #104] + adc x6, x6, x1 + ldr x7, [sp, #112] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #144] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #112] + adc x5, x5, x1 + ldr x7, [sp, #120] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #152] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + ldp x0, x1, [sp, #96] + ldr x2, [sp, #112] + mov x14, #0xe000000000000000 + adds x0, x0, x14 + sbcs x1, x1, xzr + mov x11, #0x1fffffff + adcs x2, x2, x11 + mov x10, #0x2000000000000000 + adcs x5, x5, x10 + mov x14, #0x1fffffffe0000000 + adc x3, x3, x14 + lsl x11, x0, #32 + subs x14, x0, x11 + lsr x10, x0, #32 + sbc x0, x0, x10 + adds x1, x1, x11 + adcs x2, x2, x10 + adcs x5, x5, x14 + adcs x3, x3, x0 + mov x14, #0xffffffffffffffff + mov x11, #0xffffffff + mov x10, #0xffffffff00000001 + csel x14, x14, xzr, cs + csel x11, x11, xzr, cs + csel x10, x10, xzr, cs + subs x1, x1, x14 + sbcs x2, x2, x11 + sbcs x5, x5, xzr + sbc x3, x3, x10 + mov x10, #0xffffffffffffffff + subs x10, x1, x10 + mov x11, #0xffffffff + sbcs x11, x2, x11 + mov x13, #0xffffffff00000001 + sbcs x12, x5, xzr + sbcs x13, x3, x13 + csel x10, x1, x10, cc + csel x11, x2, x11, cc + csel x12, x5, x12, cc + csel x13, x3, x13, cc + stp x10, x11, [x20] + stp x12, x13, [x20, #16] + add sp, sp, #0xa0 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +p256_scalarmulbase_local_montmul_p256: + ldr q20, [x2] + ldp x7, x17, [x1] + ldr q0, [x1] + ldp x6, x10, [x2] + ldp x11, x15, [x1, #16] + rev64 v16.4S, v20.4S + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4S, v16.4S, v0.4S + umulh x12, x17, x10 + uzp1 v28.4S, v20.4S, v0.4S + subs x14, x11, x7 + ldr q20, [x2, #16] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2D, v16.4S + umulh x4, x7, x6 + uzp1 v21.4S, v0.4S, v0.4S + cneg x11, x8, cc + shl v17.2D, v27.2D, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2D, v21.2S, v28.2S + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [x1, #16] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2S, v20.2D + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4S, v20.4S, v20.4S + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2S, v28.2D + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x2, #16] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x9, x3, x13 + adcs x3, x8, x7 + umulh x8, x14, x11 + umull v21.2D, v0.2S, v1.2S + adcs x12, x10, x12 + umull v3.2D, v0.2S, v16.2S + adc x15, x15, xzr + rev64 v24.4S, v20.4S + stp x12, x15, [x0, #16] + movi v2.2D, #0x00000000ffffffff + mul x10, x14, x11 + mul v4.4S, v24.4S, v28.4S + subs x13, x14, x5 + uzp2 v19.4S, 
v28.4S, v28.4S + csetm x15, cc + usra v3.2D, v21.2D, #32 + mul x7, x5, x1 + umull v21.2D, v19.2S, v16.2S + cneg x13, x13, cc + uaddlp v5.2D, v4.4S + subs x11, x1, x11 + and v16.16B, v3.16B, v2.16B + umulh x5, x5, x1 + shl v24.2D, v5.2D, #32 + cneg x11, x11, cc + umlal v16.2D, v19.2S, v1.2S + cinv x12, x15, cc + umlal v24.2D, v0.2S, v1.2S + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + stp x9, x3, [x0] + usra v21.2D, v3.2D, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2D, v16.2D, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + ldp x15, x8, [x0] + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + ldp x9, x13, [x0, #16] + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x15 + adcs x15, x16, x8 + eor x5, x17, x4 + adcs x9, x1, x9 + eor x1, x10, x5 + adcs x16, x2, x13 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [x0] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [x0, #16] + ret + +p256_scalarmulbase_local_montsqr_p256: + ldr q19, [x1] + ldp x9, x13, [x1] + ldr q23, [x1, #16] + ldr q0, [x1] + ldp x1, x10, [x1, #16] + uzp2 v29.4S, v19.4S, v19.4S + xtn v4.2S, v19.2D + umulh x8, x9, x13 + rev64 v20.4S, v23.4S + umull v16.2D, v19.2S, v19.2S + umull v1.2D, v29.2S, v4.2S + mul v20.4S, v20.4S, v0.4S + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2D, v19.4S, v19.4S + mov x4, v16.d[0] + uzp1 v17.4S, v23.4S, v0.4S + uaddlp v19.2D, v20.4S + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4S, v0.4S, v0.4S + shl v19.2D, v19.2D, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2D, v20.2S, v17.2S + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, 
x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x16, x3, x16, cs + csel x14, x8, x14, cs + csel x12, x11, x12, cs + csel x2, x5, x2, cs + stp x14, x12, [x0, #16] + stp x16, x2, [x0] + ret + +p256_scalarmulbase_local_p256_montjmixadd: + stp x19, x20, [sp, #-16]! + sub sp, sp, #0xc0 + mov x17, x0 + mov x19, x1 + mov x20, x2 + ldp x2, x3, [x19, #64] + ldp x4, x5, [x19, #80] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x0, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x0, x0, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x0, x0, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + subs x13, x15, x12 + lsr x11, x15, #32 + sbc x15, x15, x11 + adds x16, x16, x12 + adcs x0, x0, x11 + adcs x1, x1, x13 + adc x15, x15, xzr + lsl x12, x16, #32 + subs x13, x16, x12 + lsr x11, x16, #32 + sbc x16, x16, x11 + adds x0, x0, x12 + adcs x1, x1, x11 + adcs x15, x15, x13 + adc x16, x16, xzr + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, cc + csetm x13, cc + subs x12, x5, x4 + cneg x12, x12, cc + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, cc + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x0 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + subs x13, x6, x12 + lsr x11, x6, #32 + sbc x6, x6, x11 + adds x7, x7, x12 + adcs x8, x8, x11 + adcs x9, x9, x13 + adcs x10, x10, x6 + adc x6, xzr, xzr + lsl x12, x7, #32 + subs x13, x7, x12 + lsr x11, x7, #32 + sbc x7, x7, x11 + adds x8, x8, x12 + adcs x9, x9, x11 + adcs x10, x10, x13 + adcs x6, x6, x7 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + 
mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #0xffffffff + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #0xffffffff00000001 + sbcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, cs + csel x9, x11, x9, cs + csel x10, x12, x10, cs + csel x6, x13, x6, cs + stp x8, x9, [sp] + stp x10, x6, [sp, #16] + ldp x3, x4, [x19, #64] + ldp x5, x6, [x19, #80] + ldp x7, x8, [x20, #32] + ldp x9, x10, [x20, #48] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #32] + stp x11, x12, [sp, #48] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #32] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #48] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 
+ adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #32] + stp x3, x4, [sp, #48] + ldp x3, x4, [sp] + ldp x5, x6, [sp, #16] + ldp x7, x8, [x20] + ldp x9, x10, [x20, #16] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #64] + stp x11, x12, [sp, #80] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #64] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #80] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, 
x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #64] + stp x3, x4, [sp, #80] + ldp x3, x4, [sp] + ldp x5, x6, [sp, #16] + ldp x7, x8, [sp, #32] + ldp x9, x10, [sp, #48] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #32] + stp x11, x12, [sp, #48] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #32] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #48] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, 
#0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #32] + stp x3, x4, [sp, #48] + ldp x5, x6, [sp, #64] + ldp x4, x3, [x19] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [x19, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #160] + stp x7, x8, [sp, #176] + ldp x5, x6, [sp, #32] + ldp x4, x3, [x19, #32] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #48] + ldp x4, x3, [x19, #48] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #32] + stp x7, x8, [sp, #48] + ldp x2, x3, [sp, #160] + ldp x4, x5, [sp, #176] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x0, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x0, x0, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x0, x0, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + subs x13, x15, x12 + lsr x11, x15, #32 + sbc x15, x15, x11 + adds x16, x16, x12 + adcs x0, x0, x11 + adcs x1, x1, x13 + adc x15, x15, xzr + lsl x12, x16, #32 + subs x13, x16, x12 + lsr x11, x16, #32 + sbc x16, x16, x11 + adds x0, x0, x12 + adcs x1, x1, x11 + adcs x15, x15, x13 + adc x16, x16, xzr + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, cc + csetm x13, cc + subs x12, x5, x4 + cneg x12, x12, cc + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, cc + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x0 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + subs x13, x6, x12 + lsr x11, x6, #32 + sbc x6, x6, x11 + adds x7, x7, x12 + adcs x8, x8, x11 + adcs x9, x9, x13 + adcs x10, x10, x6 + adc x6, xzr, xzr + lsl x12, x7, #32 + subs x13, x7, x12 + lsr x11, x7, #32 + sbc x7, x7, x11 + adds x8, x8, x12 + adcs x9, x9, x11 + adcs x10, x10, x13 + adcs x6, x6, x7 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #0xffffffff + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #0xffffffff00000001 + sbcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, cs + csel x9, x11, x9, cs + csel x10, x12, x10, cs + csel x6, x13, x6, cs + stp x8, x9, [sp, #96] + stp x10, x6, [sp, #112] + ldp x2, x3, [sp, #32] + ldp x4, x5, [sp, #48] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + 
umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x0, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x0, x0, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x0, x0, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + subs x13, x15, x12 + lsr x11, x15, #32 + sbc x15, x15, x11 + adds x16, x16, x12 + adcs x0, x0, x11 + adcs x1, x1, x13 + adc x15, x15, xzr + lsl x12, x16, #32 + subs x13, x16, x12 + lsr x11, x16, #32 + sbc x16, x16, x11 + adds x0, x0, x12 + adcs x1, x1, x11 + adcs x15, x15, x13 + adc x16, x16, xzr + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, cc + csetm x13, cc + subs x12, x5, x4 + cneg x12, x12, cc + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, cc + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x0 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + subs x13, x6, x12 + lsr x11, x6, #32 + sbc x6, x6, x11 + adds x7, x7, x12 + adcs x8, x8, x11 + adcs x9, x9, x13 + adcs x10, x10, x6 + adc x6, xzr, xzr + lsl x12, x7, #32 + subs x13, x7, x12 + lsr x11, x7, #32 + sbc x7, x7, x11 + adds x8, x8, x12 + adcs x9, x9, x11 + adcs x10, x10, x13 + adcs x6, x6, x7 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #0xffffffff + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #0xffffffff00000001 + sbcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, cs + csel x9, x11, x9, cs + csel x10, x12, x10, cs + csel x6, x13, x6, cs + stp x8, x9, [sp] + stp x10, x6, [sp, #16] + ldp x3, x4, [sp, #96] + ldp x5, x6, [sp, #112] + ldp x7, x8, [x19] + ldp x9, x10, [x19, #16] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #128] + stp x11, x12, [sp, #144] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + 
cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #128] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #144] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #128] + stp x3, x4, [sp, #144] + ldp x3, x4, [sp, #96] + ldp x5, x6, [sp, #112] + ldp x7, x8, [sp, #64] + ldp x9, x10, [sp, #80] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #64] + stp x11, x12, [sp, #80] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs 
x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #64] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #80] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #64] + stp x3, x4, [sp, #80] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #16] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #16] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #96] + stp x7, x8, [sp, #112] + ldp x3, x4, [sp, #160] + ldp x5, x6, [sp, #176] + ldp x7, x8, [x19, #64] + ldp x9, x10, [x19, #80] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 
+ adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #160] + stp x11, x12, [sp, #176] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #160] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #176] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #160] + stp x3, x4, [sp, #176] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #64] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #16] + ldp x4, x3, [sp, #80] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #16] + ldp x5, x6, [sp, #128] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #144] + ldp x4, x3, [sp, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, #144] + ldp x3, x4, [sp, #96] + ldp x5, x6, [sp, #112] + ldp x7, 
x8, [x19, #32] + ldp x9, x10, [x19, #48] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #96] + stp x11, x12, [sp, #112] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #96] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #112] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #96] + stp x3, x4, [sp, #112] + ldp x3, x4, [sp, #32] + ldp x5, x6, [sp, #48] + ldp x7, x8, [sp, #128] + ldp x9, x10, [sp, #144] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, 
x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #128] + stp x11, x12, [sp, #144] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #128] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #144] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #128] + stp x3, x4, [sp, #144] + ldp x5, x6, [sp, #128] + ldp x4, x3, [sp, #96] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #144] + ldp x4, x3, [sp, #112] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and 
x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, #144] + ldp x0, x1, [x19, #64] + ldp x2, x3, [x19, #80] + orr x4, x0, x1 + orr x5, x2, x3 + orr x4, x4, x5 + cmp x4, xzr + ldp x0, x1, [sp] + ldp x12, x13, [x20] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x2, x3, [sp, #16] + ldp x12, x13, [x20, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x4, x5, [sp, #128] + ldp x12, x13, [x20, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x6, x7, [sp, #144] + ldp x12, x13, [x20, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x8, x9, [sp, #160] + mov x12, #0x1 + mov x13, #0xffffffff00000000 + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x10, x11, [sp, #176] + mov x12, #0xffffffffffffffff + mov x13, #0xfffffffe + csel x10, x10, x12, ne + csel x11, x11, x13, ne + stp x0, x1, [x17] + stp x2, x3, [x17, #16] + stp x4, x5, [x17, #32] + stp x6, x7, [x17, #48] + stp x8, x9, [x17, #64] + stp x10, x11, [x17, #80] + add sp, sp, #0xc0 + ldp x19, x20, [sp], #16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_scalarmulbase_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_scalarmulbase_alt.S new file mode 100644 index 00000000000..cb4a278d446 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_scalarmulbase_alt.S @@ -0,0 +1,3026 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Scalar multiplication for precomputed point on NIST curve P-256 +// Input scalar[4], blocksize, table[]; output res[8] +// +// extern void p256_scalarmulbase_alt +// (uint64_t res[static 8], +// uint64_t scalar[static 4], +// uint64_t blocksize, +// uint64_t *table); +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve P-256, the input argument "table" is expected to be a table of +// multiples of the point P in Montgomery-affine form, with each block +// corresponding to "blocksize" bits of the scalar as follows, where +// B = 2^{blocksize-1} (e.g. B = 8 for blocksize = 4): +// +// For each i,j with blocksize * i <= 256 and 1 <= j <= B +// the multiple 2^{blocksize * i} * j * P is stored at +// tab[8 * (B * i + (j - 1))], considered as uint64_t pointers +// or tab + 64 * (B * i + (j - 1)) as byte pointers. +// +// Standard ARM ABI: X0 = res, X1 = scalar, X2 = blocksize, X3 = table +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_scalarmulbase_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_scalarmulbase_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Safe copies of inputs and additional variables, with some aliasing + +#define res x19 +#define blocksize x20 +#define table x21 +#define i x22 +#define bf x23 +#define cf x24 +#define j x25 + +// Intermediate variables on the stack. The last z2, z3 values can +// safely be overlaid on "nacc", which is no longer needed at the end. 
+// Uppercase syntactic variants make x86_att version simpler to generate + +#define rscalar sp, #(0*NUMSIZE) +#define acc sp, #(1*NUMSIZE) +#define nacc sp, #(4*NUMSIZE) +#define tabent sp, #(7*NUMSIZE) + +#define z2 sp, #(4*NUMSIZE) +#define z3 sp, #(5*NUMSIZE) + +#define NSPACE #(9*NUMSIZE) + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(p256_scalarmulbase_alt): + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x30, [sp, #-16]! + sub sp, sp, NSPACE + +// Preserve the input arguments except the scalar, since that gets absorbed +// immediately. The "table" value subsequently gets shifted up each iteration +// of the loop, while "res" and "blocksize" are static throughout. + + mov res, x0 + mov blocksize, x2 + mov table, x3 + +// Load the digits of group order n_256 = [x15;x14;x13;x12] + + movbig(x12, #0xf3b9, #0xcac2, #0xfc63, #0x2551) + movbig(x13, #0xbce6, #0xfaad, #0xa717, #0x9e84) + mov x14, #0xffffffffffffffff + mov x15, #0xffffffff00000000 + +// First, reduce the input scalar mod n_256, i.e. conditionally subtract n_256 +// Store it to "rscalar" (reduced scalar) + + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + + subs x6, x2, x12 + sbcs x7, x3, x13 + sbcs x8, x4, x14 + sbcs x9, x5, x15 + + csel x2, x2, x6, cc + csel x3, x3, x7, cc + csel x4, x4, x8, cc + csel x5, x5, x9, cc + + stp x2, x3, [rscalar] + stp x4, x5, [rscalar+16] + +// Initialize the accumulator to all zeros and the "carry flag" cf to 0 + + stp xzr, xzr, [acc] + stp xzr, xzr, [acc+16] + stp xzr, xzr, [acc+32] + stp xzr, xzr, [acc+48] + stp xzr, xzr, [acc+64] + stp xzr, xzr, [acc+80] + mov cf, xzr + +// Main loop over {i >= 0 | blocksize * i <= 256}. Note the non-strict +// inequality, to allow top carry for any choices of blocksize. + + mov i, xzr + +p256_scalarmulbase_alt_loop: + +// The next raw bitfield is bf = bitfield(blocksize * i,blocksize) + cf, +// adding in the deferred carry cf. We then shift the whole scalar right +// by blocksize so we can keep picking bitfield(0,blocksize). + + ldp x0, x1, [rscalar] + ldp x2, x3, [rscalar+16] + + mov x4, #1 + lsl x4, x4, blocksize + sub x4, x4, #1 + and x4, x4, x0 + add bf, x4, cf + + neg x8, blocksize + + lsl x5, x1, x8 + + lsr x0, x0, blocksize + orr x0, x0, x5 + + lsl x6, x2, x8 + lsr x1, x1, blocksize + orr x1, x1, x6 + + lsl x7, x3, x8 + lsr x2, x2, blocksize + orr x2, x2, x7 + + lsr x3, x3, blocksize + + stp x0, x1, [rscalar] + stp x2, x3, [rscalar+16] + +// Now if bf <= B we just select entry j, unnegated and set cf = 0. +// If bf > B we set j = 2 * B - bf and negate the j'th entry, setting cf = 1. +// In either case we ultimately add bf, in the latter case with deferred +// carry as 2 * B - (2 * B - bf) = bf. + + mov x0, #1 + lsl x1, x0, blocksize + lsr x0, x1, #1 + + sub x2, x1, bf + + cmp x0, bf + cset cf, cc + csel j, x2, bf, cc + +// Load table entry j - 1 for nonzero j in constant-time style. 
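For readers of this imported file, the window recoding that the comment blocks above describe can be summarised in plain C. The sketch below is illustrative only and is not part of the patch: the names recode_p256_windows and MAXBLOCKS are invented here, and it assumes 0 < blocksize < 64 and a scalar already reduced mod n_256, exactly as the code above arranges before entering the loop.

#include <stdint.h>

/* Illustrative only: one digit and one sign flag per scalar block. */
#define MAXBLOCKS 257   /* enough for any blocksize >= 1 on a 256-bit scalar */

static uint64_t recode_p256_windows(const uint64_t scalar[4], uint64_t blocksize,
                                    uint64_t digit[MAXBLOCKS], int negate[MAXBLOCKS]) {
  uint64_t x[4] = {scalar[0], scalar[1], scalar[2], scalar[3]};
  uint64_t B = (uint64_t)1 << (blocksize - 1);   /* half the window range */
  uint64_t cf = 0, i = 0;
  do {
    /* bf = bitfield(blocksize*i, blocksize) of the remaining scalar,
       plus the carry deferred from the previous block */
    uint64_t bf = (x[0] & (2 * B - 1)) + cf;
    /* shift the whole 256-bit scalar right by blocksize bits */
    for (int k = 0; k < 3; k++)
      x[k] = (x[k] >> blocksize) | (x[k + 1] << (64 - blocksize));
    x[3] >>= blocksize;
    /* bf <= B: take the digit bf positively; bf > B: take 2*B - bf negated
       and defer a carry of one into the next block, using
       bf * 2^(blocksize*i) == -(2*B - bf) * 2^(blocksize*i) + 2^(blocksize*(i+1)) */
    negate[i] = (bf > B);
    digit[i] = negate[i] ? 2 * B - bf : bf;
    cf = (uint64_t)negate[i];
    i++;
  } while (blocksize * i <= 256);   /* non-strict bound, matching the loop above */
  return i;                         /* number of blocks, including a possible top carry */
}

Every digit lands in [0, B], which is why each block of the precomputed table only needs B entries; a zero digit corresponds to the j = 0 case that the assembly handles by simply not committing the mixed-addition result to the accumulator, and negation of an entry is realised there by replacing its y coordinate with p_256 - y.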
+ + mov x16, #1 + lsl x16, x16, blocksize + lsr x16, x16, #1 + mov x17, j + +p256_scalarmulbase_alt_tabloop: + ldp x8, x9, [table] + ldp x10, x11, [table, #16] + ldp x12, x13, [table, #32] + ldp x14, x15, [table, #48] + + subs x17, x17, #1 + csel x0, x8, x0, eq + csel x1, x9, x1, eq + csel x2, x10, x2, eq + csel x3, x11, x3, eq + csel x4, x12, x4, eq + csel x5, x13, x5, eq + csel x6, x14, x6, eq + csel x7, x15, x7, eq + + add table, table, #64 + + sub x16, x16, #1 + cbnz x16, p256_scalarmulbase_alt_tabloop + +// Before storing back, optionally negate the y coordinate of the table entry + + stp x0, x1, [tabent] + stp x2, x3, [tabent+16] + + mov x0, 0xffffffffffffffff + subs x0, x0, x4 + mov x1, 0x00000000ffffffff + sbcs x1, x1, x5 + mov x3, 0xffffffff00000001 + sbcs x2, xzr, x6 + sbc x3, x3, x7 + + cmp cf, xzr + csel x4, x0, x4, ne + csel x5, x1, x5, ne + csel x6, x2, x6, ne + csel x7, x3, x7, ne + + stp x4, x5, [tabent+32] + stp x6, x7, [tabent+48] + +// Add the adjusted table point to the accumulator + + add x0, nacc + add x1, acc + add x2, tabent + bl p256_scalarmulbase_alt_local_p256_montjmixadd + +// However, only commit that update to the accumulator if j is nonzero, +// because the mixed addition function does not handle this case directly, +// and in any case we didn't choose the table entry appropriately. + + cmp j, xzr + ldp x0, x1, [acc] + ldp x12, x13, [nacc] + csel x0, x12, x0, ne + csel x1, x13, x1, ne + + ldp x2, x3, [acc+16] + ldp x12, x13, [nacc+16] + csel x2, x12, x2, ne + csel x3, x13, x3, ne + + ldp x4, x5, [acc+32] + ldp x12, x13, [nacc+32] + csel x4, x12, x4, ne + csel x5, x13, x5, ne + + ldp x6, x7, [acc+48] + ldp x12, x13, [nacc+48] + csel x6, x12, x6, ne + csel x7, x13, x7, ne + + ldp x8, x9, [acc+64] + ldp x12, x13, [nacc+64] + csel x8, x12, x8, ne + csel x9, x13, x9, ne + + ldp x10, x11, [acc+80] + ldp x12, x13, [nacc+80] + csel x10, x12, x10, ne + csel x11, x13, x11, ne + + stp x0, x1, [acc] + stp x2, x3, [acc+16] + stp x4, x5, [acc+32] + stp x6, x7, [acc+48] + stp x8, x9, [acc+64] + stp x10, x11, [acc+80] + +// Loop while blocksize * i <= 256 + + add i, i, #1 + mul x0, blocksize, i + cmp x0, #257 + bcc p256_scalarmulbase_alt_loop + +// That's the end of the main loop, and we just need to translate +// back from the Jacobian representation to affine. 
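To make the Montgomery bookkeeping in the upcoming calls easier to follow, here is a hedged C-level sketch of the same sequence. The wrapper names are hypothetical stand-ins for the local montsqr/montmul/demont/inv subroutines defined later in this file; writing R = 2^256 and montmul(a, b) = a*b*R^-1 mod p_256, the accumulator holds X*R, Y*R, Z*R, and the sequence cancels the Montgomery factor while dividing out Z.

#include <stdint.h>

/* Hypothetical prototypes mirroring the local assembly subroutines below;
   every argument is a 4-limb field element mod p_256. */
void montsqr_p256(uint64_t r[4], const uint64_t a[4]);                       /* a*a*R^-1 */
void montmul_p256(uint64_t r[4], const uint64_t a[4], const uint64_t b[4]);  /* a*b*R^-1 */
void demont_p256(uint64_t r[4], const uint64_t a[4]);                        /* a*R^-1   */
void inv_p256(uint64_t r[4], const uint64_t a[4]);                           /* a^-1 mod p_256 */

/* acc = Jacobian (X*R, Y*R, Z*R); res = affine (X/Z^2, Y/Z^3) in plain form. */
static void jacobian_to_affine_sketch(uint64_t res[8], const uint64_t acc[12]) {
  const uint64_t *X = acc, *Y = acc + 4, *Z = acc + 8;
  uint64_t z2[4], z3[4];
  montsqr_p256(z2, Z);           /* z2 = Z^2 * R                          */
  montmul_p256(z3, Z, z2);       /* z3 = Z^3 * R                          */
  demont_p256(z2, z3);           /* z2 = Z^3      (Montgomery factor removed) */
  inv_p256(z3, z2);              /* z3 = 1/Z^3    (plain)                 */
  montmul_p256(z2, Z, z3);       /* z2 = (Z*R)*(1/Z^3)*R^-1 = 1/Z^2       */
  montmul_p256(res, X, z2);      /* (X*R)*(1/Z^2)*R^-1 = X/Z^2            */
  montmul_p256(res + 4, Y, z3);  /* (Y*R)*(1/Z^3)*R^-1 = Y/Z^3            */
}

The demont step is what makes the final two montmul calls produce plain, non-Montgomery affine coordinates, matching the comment that z2 and z3 are held "without Montgomery form".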
First of all, +// let z2 = 1/z^2 and z3 = 1/z^3, both without Montgomery form + + add x0, z2 + add x1, acc+64 + bl p256_scalarmulbase_alt_local_montsqr_p256 + + add x0, z3 + add x1, acc+64 + add x2, z2 + bl p256_scalarmulbase_alt_local_montmul_p256 + + add x0, z2 + add x1, z3 + bl p256_scalarmulbase_alt_local_demont_p256 + + add x0, z3 + add x1, z2 + bl p256_scalarmulbase_alt_local_inv_p256 + + add x0, z2 + add x1, acc+64 + add x2, z3 + bl p256_scalarmulbase_alt_local_montmul_p256 + +// Convert back from Jacobian (X,Y,Z) |-> (X/Z^2, Y/Z^3) + + mov x0, res + add x1, acc + add x2, z2 + bl p256_scalarmulbase_alt_local_montmul_p256 + + add x0, res, #32 + add x1, acc+32 + add x2, z3 + bl p256_scalarmulbase_alt_local_montmul_p256 + +// Restore stack and registers and return + + add sp, sp, NSPACE + ldp x25, x30, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +// Local copies of subroutines, complete clones at the moment + +p256_scalarmulbase_alt_local_demont_p256: + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + lsl x7, x2, #32 + subs x8, x2, x7 + lsr x6, x2, #32 + sbc x2, x2, x6 + adds x3, x3, x7 + adcs x4, x4, x6 + adcs x5, x5, x8 + adc x2, x2, xzr + lsl x7, x3, #32 + subs x8, x3, x7 + lsr x6, x3, #32 + sbc x3, x3, x6 + adds x4, x4, x7 + adcs x5, x5, x6 + adcs x2, x2, x8 + adc x3, x3, xzr + lsl x7, x4, #32 + subs x8, x4, x7 + lsr x6, x4, #32 + sbc x4, x4, x6 + adds x5, x5, x7 + adcs x2, x2, x6 + adcs x3, x3, x8 + adc x4, x4, xzr + lsl x7, x5, #32 + subs x8, x5, x7 + lsr x6, x5, #32 + sbc x5, x5, x6 + adds x2, x2, x7 + adcs x3, x3, x6 + adcs x4, x4, x8 + adc x5, x5, xzr + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + ret + +p256_scalarmulbase_alt_local_inv_p256: + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! 
+ sub sp, sp, #0xa0 + mov x20, x0 + mov x10, #0xffffffffffffffff + mov x11, #0xffffffff + mov x13, #0xffffffff00000001 + stp x10, x11, [sp] + stp xzr, x13, [sp, #16] + str xzr, [sp, #32] + ldp x2, x3, [x1] + subs x10, x2, x10 + sbcs x11, x3, x11 + ldp x4, x5, [x1, #16] + sbcs x12, x4, xzr + sbcs x13, x5, x13 + csel x2, x2, x10, cc + csel x3, x3, x11, cc + csel x4, x4, x12, cc + csel x5, x5, x13, cc + stp x2, x3, [sp, #48] + stp x4, x5, [sp, #64] + str xzr, [sp, #80] + stp xzr, xzr, [sp, #96] + stp xzr, xzr, [sp, #112] + mov x10, #0x4000000000000 + stp x10, xzr, [sp, #128] + stp xzr, xzr, [sp, #144] + mov x21, #0xa + mov x22, #0x1 + b p256_scalarmulbase_alt_inv_midloop +p256_scalarmulbase_alt_inv_loop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #48] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #64] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #56] + ldr x7, [sp, #24] + eor x1, x7, x14 + ldr x23, [sp, #32] + eor x3, x23, x14 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #72] + eor x1, x8, x15 + ldr x24, [sp, #80] + eor x0, x24, x15 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + asr x3, x3, #59 + str x3, [sp, #32] + eor x1, x7, x16 + eor x5, x23, x16 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + eor x0, x24, x17 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #64] + extr x2, x5, x2, #59 + str x2, [sp, #72] + asr x5, x5, #59 + str x5, [sp, #80] + ldr x7, [sp, #96] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #128] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #96] + adc x2, x2, x1 + 
eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #128] + adc x3, x3, x1 + ldr x7, [sp, #104] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #136] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #104] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #136] + adc x4, x4, x1 + ldr x7, [sp, #112] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #144] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #112] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #144] + adc x2, x2, x1 + ldr x7, [sp, #120] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #152] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + ldp x0, x1, [sp, #96] + ldr x6, [sp, #112] + mov x14, #0xe000000000000000 + adds x0, x0, x14 + sbcs x1, x1, xzr + mov x11, #0x1fffffff + adcs x6, x6, x11 + mov x10, #0x2000000000000000 + adcs x5, x5, x10 + mov x14, #0x1fffffffe0000000 + adc x3, x3, x14 + lsl x11, x0, #32 + subs x14, x0, x11 + lsr x10, x0, #32 + sbc x0, x0, x10 + adds x1, x1, x11 + adcs x6, x6, x10 + adcs x5, x5, x14 + adcs x3, x3, x0 + mov x14, #0xffffffffffffffff + mov x11, #0xffffffff + mov x10, #0xffffffff00000001 + csel x14, x14, xzr, cs + csel x11, x11, xzr, cs + csel x10, x10, xzr, cs + subs x1, x1, x14 + sbcs x6, x6, x11 + sbcs x5, x5, xzr + sbc x3, x3, x10 + stp x1, x6, [sp, #96] + stp x5, x3, [sp, #112] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + ldp x0, x1, [sp, #128] + ldr x3, [sp, #144] + mov x14, #0xe000000000000000 + adds x0, x0, x14 + sbcs x1, x1, xzr + mov x11, #0x1fffffff + adcs x3, x3, x11 + mov x10, #0x2000000000000000 + adcs x2, x2, x10 + mov x14, #0x1fffffffe0000000 + adc x5, x5, x14 + lsl x11, x0, #32 + subs x14, x0, x11 + lsr x10, x0, #32 + sbc x0, x0, x10 + adds x1, x1, x11 + adcs x3, x3, x10 + adcs x2, x2, x14 + adcs x5, x5, x0 + mov x14, #0xffffffffffffffff + mov x11, #0xffffffff + mov x10, #0xffffffff00000001 + csel x14, x14, xzr, cs + csel x11, x11, xzr, cs + csel x10, x10, xzr, cs + subs x1, x1, x14 + sbcs x3, x3, x11 + sbcs x2, x2, xzr + sbc x5, x5, x10 + stp x1, x3, [sp, #128] + stp x2, x5, [sp, #144] +p256_scalarmulbase_alt_inv_midloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #48] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, 
#0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + 
orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + 
asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg 
x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + bne p256_scalarmulbase_alt_inv_loop + ldr x0, [sp] + ldr x1, [sp, #48] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #96] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #128] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #96] + adc x2, x2, x1 + ldr x7, [sp, #104] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #136] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #104] + adc x6, x6, x1 + ldr x7, [sp, #112] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #144] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #112] + adc x5, x5, x1 + ldr x7, [sp, #120] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #152] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + ldp x0, x1, [sp, #96] + ldr x2, [sp, #112] + mov x14, #0xe000000000000000 + adds x0, x0, x14 + sbcs x1, x1, xzr + mov x11, #0x1fffffff + adcs x2, x2, x11 + mov x10, #0x2000000000000000 + adcs x5, x5, x10 + mov x14, #0x1fffffffe0000000 + adc x3, x3, x14 + lsl x11, x0, #32 + subs x14, x0, x11 + lsr x10, x0, #32 + sbc x0, x0, x10 + adds x1, x1, x11 + adcs x2, x2, x10 + adcs x5, x5, x14 + adcs x3, x3, x0 + mov x14, #0xffffffffffffffff + mov x11, #0xffffffff + mov x10, #0xffffffff00000001 + csel x14, x14, xzr, cs + csel x11, x11, xzr, cs + csel x10, x10, xzr, cs + subs x1, x1, x14 + sbcs x2, x2, x11 + sbcs x5, x5, xzr + sbc x3, x3, x10 + mov x10, #0xffffffffffffffff + subs x10, x1, x10 + mov x11, #0xffffffff + sbcs x11, x2, x11 + mov x13, #0xffffffff00000001 + sbcs x12, x5, xzr + sbcs x13, x3, x13 + csel x10, x1, x10, cc + csel x11, x2, x11, cc + csel x12, x5, x12, cc + csel x13, x3, x13, cc + stp x10, x11, [x20] + stp x12, x13, [x20, #16] + add sp, sp, #0xa0 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +p256_scalarmulbase_alt_local_montmul_p256: + ldp x3, x4, [x1] + ldp x7, x8, [x2] + mul x12, x3, x7 + umulh x13, 
x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x2, #16] + mul x11, x3, x9 + umulh x15, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x16, x3, x10 + adcs x15, x15, x11 + adc x16, x16, xzr + ldp x5, x6, [x1, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x15, x15, x11 + mul x11, x4, x10 + adcs x16, x16, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x15, x15, x11 + umulh x11, x4, x9 + adcs x16, x16, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x15, x15, x11 + mul x11, x5, x9 + adcs x16, x16, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x15, x15, x11 + umulh x11, x5, x8 + adcs x16, x16, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x15, x15, x11 + mul x11, x6, x8 + adcs x16, x16, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x15, x15, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x16, x16, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x15, x15, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x15, x15, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x15, lsl #32 + lsr x11, x15, #32 + adcs x13, x13, x11 + mul x11, x15, x10 + umulh x15, x15, x10 + adcs x14, x14, x11 + adc x15, x15, xzr + adds x12, x12, x16 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x15, x15, x5 + cset x8, cs + mov x11, #0xffffffff + adds x16, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x15, x10 + sbcs xzr, x8, xzr + csel x12, x12, x16, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x15, x15, x5, cc + stp x12, x13, [x0] + stp x14, x15, [x0, #16] + ret + +p256_scalarmulbase_alt_local_montsqr_p256: + ldp x2, x3, [x1] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x1, #16] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + mov x5, #0xffffffff00000001 + adds x9, x9, x8, lsl #32 + lsr x2, x8, #32 + adcs x10, x10, x2 + mul x2, x8, x5 + umulh x8, x8, x5 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x2, x9, #32 + adcs x11, x11, x2 + mul x2, x9, x5 + umulh x9, x9, x5 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x2, x10, #32 + adcs x8, x8, x2 + mul x2, 
x10, x5 + umulh x10, x10, x5 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x2, x11, #32 + adcs x9, x9, x2 + mul x2, x11, x5 + umulh x11, x11, x5 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, cs + mov x3, #0xffffffff + adds x12, x8, #0x1 + sbcs x13, x9, x3 + sbcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, cc + csel x9, x9, x13, cc + csel x10, x10, x14, cc + csel x11, x11, x7, cc + stp x8, x9, [x0] + stp x10, x11, [x0, #16] + ret + +p256_scalarmulbase_alt_local_p256_montjmixadd: + sub sp, sp, #0xc0 + mov x15, x0 + mov x16, x1 + mov x17, x2 + ldp x2, x3, [x16, #64] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x16, #80] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mov x3, #0xffffffff00000001 + mul x2, x8, x3 + umulh x8, x8, x3 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mov x3, #0xffffffff00000001 + mul x2, x9, x3 + umulh x9, x9, x3 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mov x3, #0xffffffff00000001 + mul x2, x10, x3 + umulh x10, x10, x3 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mov x3, #0xffffffff00000001 + mul x2, x11, x3 + umulh x11, x11, x3 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + mov x2, #0xffffffffffffffff + csel x2, xzr, x2, cc + mov x3, #0xffffffff + csel x3, xzr, x3, cc + mov x5, #0xffffffff00000001 + csel x5, xzr, x5, cc + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, xzr + sbc x11, x11, x5 + stp x8, x9, [sp] + stp x10, x11, [sp, #16] + ldp x3, x4, [x16, #64] + ldp x7, x8, [x17, #32] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x17, #48] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [x16, #80] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + 
adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #32] + stp x14, x0, [sp, #48] + ldp x3, x4, [sp] + ldp x7, x8, [x17] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x17, #16] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel 
x0, x0, x5, cc + stp x12, x13, [sp, #64] + stp x14, x0, [sp, #80] + ldp x3, x4, [sp] + ldp x7, x8, [sp, #32] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #48] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #32] + stp x14, x0, [sp, #48] + ldp x5, x6, [sp, #64] + ldp x4, x3, [x16] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [x16, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #160] + stp x7, x8, [sp, #176] + ldp x5, x6, [sp, #32] + ldp x4, x3, [x16, #32] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #48] + ldp x4, x3, [x16, #48] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #32] + stp x7, x8, [sp, #48] + ldp x2, x3, [sp, #160] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #176] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, 
x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mov x3, #0xffffffff00000001 + mul x2, x8, x3 + umulh x8, x8, x3 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mov x3, #0xffffffff00000001 + mul x2, x9, x3 + umulh x9, x9, x3 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mov x3, #0xffffffff00000001 + mul x2, x10, x3 + umulh x10, x10, x3 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mov x3, #0xffffffff00000001 + mul x2, x11, x3 + umulh x11, x11, x3 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + mov x2, #0xffffffffffffffff + csel x2, xzr, x2, cc + mov x3, #0xffffffff + csel x3, xzr, x3, cc + mov x5, #0xffffffff00000001 + csel x5, xzr, x5, cc + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, xzr + sbc x11, x11, x5 + stp x8, x9, [sp, #96] + stp x10, x11, [sp, #112] + ldp x2, x3, [sp, #32] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #48] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mov x3, #0xffffffff00000001 + mul x2, x8, x3 + umulh x8, x8, x3 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mov x3, #0xffffffff00000001 + mul x2, x9, x3 + umulh x9, x9, x3 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mov x3, #0xffffffff00000001 + mul x2, x10, x3 + umulh x10, x10, x3 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mov x3, #0xffffffff00000001 + mul x2, x11, x3 + umulh x11, x11, x3 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, cs + mov x3, #0xffffffff + mov x5, #0xffffffff00000001 + adds x12, x8, #0x1 + sbcs x13, x9, x3 + sbcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, cc + csel x9, x9, x13, cc + csel x10, x10, x14, cc + csel x11, x11, x7, cc + stp x8, x9, [sp] + stp x10, x11, [sp, #16] + ldp x3, x4, [sp, #96] + ldp x7, x8, [x16] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, 
x13, x11 + ldp x9, x10, [x16, #16] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #112] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #128] + stp x14, x0, [sp, #144] + ldp x3, x4, [sp, #96] + ldp x7, x8, [sp, #64] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #80] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #112] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 
+ umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #64] + stp x14, x0, [sp, #80] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #16] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #16] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #96] + stp x7, x8, [sp, #112] + ldp x3, x4, [sp, #160] + ldp x7, x8, [x16, #64] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x16, #80] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #176] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, 
x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #160] + stp x14, x0, [sp, #176] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #64] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #16] + ldp x4, x3, [sp, #80] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #16] + ldp x5, x6, [sp, #128] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #144] + ldp x4, x3, [sp, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, #144] + ldp x3, x4, [sp, #96] + ldp x7, x8, [x16, #32] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x16, #48] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #112] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #96] + stp x14, x0, [sp, #112] + ldp x3, x4, [sp, #32] + ldp x7, x8, [sp, #128] + mul 
x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #144] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #48] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #128] + stp x14, x0, [sp, #144] + ldp x5, x6, [sp, #128] + ldp x4, x3, [sp, #96] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #144] + ldp x4, x3, [sp, #112] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, #144] + ldp x0, x1, [x16, #64] + ldp x2, x3, [x16, #80] + orr x4, x0, x1 + orr x5, x2, x3 + orr x4, x4, x5 + cmp x4, xzr + ldp x0, x1, [sp] + ldp x12, x13, [x17] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x2, x3, [sp, #16] + ldp x12, x13, [x17, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x4, x5, [sp, #128] + ldp x12, x13, [x17, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x6, x7, [sp, #144] + ldp x12, x13, [x17, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x8, x9, [sp, #160] + mov x12, #0x1 + mov x13, #0xffffffff00000000 + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x10, x11, [sp, #176] + mov x12, #0xffffffffffffffff + mov x13, #0xfffffffe + csel x10, x10, x12, ne + csel x11, x11, x13, ne + stp x0, x1, [x15] + stp x2, x3, [x15, #16] + stp x4, x5, [x15, #32] + stp x6, x7, [x15, #48] + stp x8, x9, [x15, #64] + stp x10, x11, [x15, #80] + add sp, sp, #0xc0 + 
ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/README.md b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/README.md new file mode 100644 index 00000000000..fa63f949fd1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/README.md @@ -0,0 +1,9 @@ +This directory contains Arm implementations that are functional but slower +than the implementations with the same file names. The implementations in the +parent directory are mechanically/manually optimized versions of this +directory, meaning that their high-level algorithms are unchanged but the +implementation details are updated. + +These functions will only be compiled when running HOL Light proofs using +`make proofs` because HOL Light proofs use these to keep proofs simple. +The compiled object files will not be included in libs2nbignum. diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/bignum_montmul_p256_base.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/bignum_montmul_p256_base.S new file mode 100644 index 00000000000..b4dd6087fdd --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/bignum_montmul_p256_base.S @@ -0,0 +1,275 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^256) mod p_256 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_montmul_p256_base +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Does z := (2^{-256} * x * y) mod p_256, assuming that the inputs x and y +// satisfy x * y <= 2^256 * p_256 (in particular this is true if we are in +// the "usual" case x < p_256 and y < p_256). +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p256_base) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p256_base) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Macro returning (c,h,l) = 3-word 1s complement (x - y) * (w - z) +// c,h,l,t should all be different +// t,h should not overlap w,z +// --------------------------------------------------------------------------- + +#define muldiffn(c,h,l, t, x,y, w,z) \ + subs t, x, y __LF \ + cneg t, t, cc __LF \ + csetm c, cc __LF \ + subs h, w, z __LF \ + cneg h, h, cc __LF \ + mul l, t, h __LF \ + umulh h, t, h __LF \ + cinv c, c, cc __LF \ + eor l, l, c __LF \ + eor h, h, c + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1] and generating d4 from zero, re-using +// d0 as a temporary internally together with t0, t1 and t2. +// It is fine for d4 to be the same register as d0, and it often is. 
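+// (A brief added note on why this works: the multiplier 2^256 - 2^224 + 2^192 + 2^96 is p_256 + 1, so the +// sum differs from the original [d3;d2;d1;d0] by exactly p_256 * w; because p_256 == -1 (mod 2^64) the +// bottom word of that sum is zero, and returning only the upper words [d4;d3;d2;d1] therefore divides by +// 2^64, i.e. performs one word-sized Montgomery reduction step modulo p_256.)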
+// --------------------------------------------------------------------------- + +#define montreds(d4,d3,d2,d1,d0, t2,t1,t0) \ +/* Let w = d0, the original word we use as offset; d0 gets recycled */ \ +/* First let [t2;t1] = 2^32 * w */ \ +/* then let [d0;t0] = (2^64 - 2^32 + 1) * w (overwrite old d0) */ \ + lsl t1, d0, #32 __LF \ + subs t0, d0, t1 __LF \ + lsr t2, d0, #32 __LF \ + sbc d0, d0, t2 __LF \ +/* Hence [d4;..;d1] := [d3;d2;d1;0] + (2^256 - 2^224 + 2^192 + 2^96) * w */ \ + adds d1, d1, t1 __LF \ + adcs d2, d2, t2 __LF \ + adcs d3, d3, t0 __LF \ + adc d4, d0, xzr + +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define a3 x6 +#define b0 x7 +#define b1 x8 +#define b2 x9 +#define b3 x10 + +#define s0 x11 +#define s1 x12 +#define s2 x13 +#define s3 x14 +#define t0 x15 +#define t1 x16 +#define t2 x17 +#define t3 x1 +#define s4 x2 + +S2N_BN_SYMBOL(bignum_montmul_p256_base): + +// Load in all words of both inputs + + ldp a0, a1, [x1] + ldp a2, a3, [x1, #16] + ldp b0, b1, [x2] + ldp b2, b3, [x2, #16] + +// Multiply low halves with a 2x2->4 ADK multiplier as L = [s3;s2;s1;s0] + + mul s0, a0, b0 + mul s2, a1, b1 + umulh s1, a0, b0 + adds t1, s0, s2 + umulh s3, a1, b1 + adcs t2, s1, s3 + adcs s3, s3, xzr + adds s1, s1, t1 + adcs s2, s2, t2 + adcs s3, s3, xzr + muldiffn(t3,t2,t1, t0, a0,a1, b1,b0) + adds xzr, t3, #1 + adcs s1, s1, t1 + adcs s2, s2, t2 + adc s3, s3, t3 + +// Perform two "short" Montgomery steps on the low product to +// get a modified low result L' = [s1;s0;s3;s2] +// This shifts it to an offset compatible with middle terms +// Stash the result L' temporarily in the output buffer to avoid +// using additional registers. + + montreds(s0,s3,s2,s1,s0, t1,t2,t3) + montreds(s1,s0,s3,s2,s1, t1,t2,t3) + + stp s2, s3, [x0] + stp s0, s1, [x0, #16] + +// Multiply high halves with a 2x2->4 ADK multiplier as H = [s3;s2;s1;s0] + + mul s0, a2, b2 + mul s2, a3, b3 + umulh s1, a2, b2 + adds t1, s0, s2 + umulh s3, a3, b3 + adcs t2, s1, s3 + adcs s3, s3, xzr + adds s1, s1, t1 + adcs s2, s2, t2 + adcs s3, s3, xzr + muldiffn(t3,t2,t1, t0, a2,a3, b3,b2) + adds xzr, t3, #1 + adcs s1, s1, t1 + adcs s2, s2, t2 + adc s3, s3, t3 + +// Compute sign-magnitude a2,[a1,a0] = x_hi - x_lo + + subs a0, a2, a0 + sbcs a1, a3, a1 + sbc a2, xzr, xzr + adds xzr, a2, #1 + eor a0, a0, a2 + adcs a0, a0, xzr + eor a1, a1, a2 + adcs a1, a1, xzr + +// Compute sign-magnitude b2,[b1,b0] = y_lo - y_hi + + subs b0, b0, b2 + sbcs b1, b1, b3 + sbc b2, xzr, xzr + adds xzr, b2, #1 + eor b0, b0, b2 + adcs b0, b0, xzr + eor b1, b1, b2 + adcs b1, b1, xzr + +// Save the correct sign for the sub-product in b3 + + eor b3, a2, b2 + +// Add the high H to the modified low term L' as H + L' = [s4;b2;a2;t3;t0] + + ldp t0, t3, [x0] + adds t0, s0, t0 + adcs t3, s1, t3 + ldp a2, b2, [x0, #16] + adcs a2, s2, a2 + adcs b2, s3, b2 + adc s4, xzr, xzr + +// Multiply with yet a third 2x2->4 ADK multiplier for complex mid-term M + + mul s0, a0, b0 + mul s2, a1, b1 + umulh s1, a0, b0 + adds t1, s0, s2 + umulh s3, a1, b1 + adcs t2, s1, s3 + adcs s3, s3, xzr + adds s1, s1, t1 + adcs s2, s2, t2 + adcs s3, s3, xzr + muldiffn(a1,t2,t1, a0, a0,a1, b1,b0) + adds xzr, a1, #1 + adcs s1, s1, t1 + adcs s2, s2, t2 + adc s3, s3, a1 + +// Set up a sign-modified version of the mid-product in a long accumulator +// as [b3;a1;a0;s3;s2;s1;s0], adding in the H + L' term once with +// zero offset as this signed value is created + + adds xzr, b3, #1 + eor s0, s0, b3 + adcs s0, s0, t0 + eor s1, s1, b3 + adcs s1, s1, t3 + eor s2, s2, b3 + adcs s2, s2, a2 + eor s3, s3, b3 
+ adcs s3, s3, b2 + adcs a0, s4, b3 + adcs a1, b3, xzr + adc b3, b3, xzr + +// Add in the stashed H + L' term an offset of 2 words as well + + adds s2, s2, t0 + adcs s3, s3, t3 + adcs a0, a0, a2 + adcs a1, a1, b2 + adc b3, b3, s4 + +// Do two more Montgomery steps on the composed term +// Net pre-reduct is in [b3;a1;a0;s3;s2] + + montreds(s0,s3,s2,s1,s0, t1,t2,t3) + montreds(s1,s0,s3,s2,s1, t1,t2,t3) + + adds a0, a0, s0 + adcs a1, a1, s1 + adc b3, b3, xzr + +// Because of the way we added L' in two places, we can overspill by +// more than usual in Montgomery, with the result being only known to +// be < 3 * p_256, not the usual < 2 * p_256. So now we do a more +// elaborate final correction in the style of bignum_cmul_p256, though +// we can use much simpler quotient estimation logic (q = h + 1) and +// slightly more direct accumulation of p_256 * q. + +#define d0 s2 +#define d1 s3 +#define d2 a0 +#define d3 a1 +#define h b3 + +#define q s4 +#define c b0 + + add q, h, #1 + lsl t1, q, #32 + + adds d3, d3, t1 + adc h, h, xzr + sub t0, xzr, q + sub t1, t1, #1 + subs d0, d0, t0 + sbcs d1, d1, t1 + sbcs d2, d2, xzr + sbcs d3, d3, q + sbcs c, h, q + adds d0, d0, c + mov h, #0x00000000ffffffff + and h, h, c + adcs d1, d1, h + adcs d2, d2, xzr + mov h, #0xffffffff00000001 + and h, h, c + adc d3, d3, h + +// Finally store the result + + stp d0, d1, [x0] + stp d2, d3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/bignum_montsqr_p256_base.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/bignum_montsqr_p256_base.S new file mode 100644 index 00000000000..d6d0a9ebde9 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/bignum_montsqr_p256_base.S @@ -0,0 +1,266 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^256) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_montsqr_p256_base +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Does z := (x^2 / 2^256) mod p_256, assuming x^2 <= 2^256 * p_256, which is +// guaranteed in particular if x < p_256 initially (the "intended" case). +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p256_base) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p256_base) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Macro returning (c,h,l) = 3-word 1s complement (x - y) * (w - z) +// c,h,l,t should all be different +// t,h should not overlap w,z +// --------------------------------------------------------------------------- + +#define muldiffn(c,h,l, t, x,y, w,z) \ + subs t, x, y __LF \ + cneg t, t, cc __LF \ + csetm c, cc __LF \ + subs h, w, z __LF \ + cneg h, h, cc __LF \ + mul l, t, h __LF \ + umulh h, t, h __LF \ + cinv c, c, cc __LF \ + eor l, l, c __LF \ + eor h, h, c + +// --------------------------------------------------------------------------- +// Core one-step "end" Montgomery reduction macro. 
Takes input in +// [d5;d4;d3;d2;d1;d0] and returns result in [d5;d4;d3;d2;d1], adding to +// the existing [d4;d3;d2;d1], re-using d0 as a temporary internally as well +// as t1, t2, t3, and initializing d5 from zero (hence "end"). +// --------------------------------------------------------------------------- + +#define montrede(d5, d4,d3,d2,d1,d0, t2,t1,t0) \ +/* Let w = d0, the original word we use as offset; d0 gets recycled */ \ +/* First let [t2;t1] = 2^32 * w */ \ +/* then let [d0;t0] = (2^64 - 2^32 + 1) * w (overwrite old d0) */ \ + lsl t1, d0, #32 __LF \ + subs t0, d0, t1 __LF \ + lsr t2, d0, #32 __LF \ + sbc d0, d0, t2 __LF \ +/* Hence basic [d4;d3;d2;d1] += (2^256 - 2^224 + 2^192 + 2^96) * w */ \ + adds d1, d1, t1 __LF \ + adcs d2, d2, t2 __LF \ + adcs d3, d3, t0 __LF \ + adcs d4, d4, d0 __LF \ + adc d5, xzr, xzr + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1] and generating d4 from zero, re-using +// d0 as a temporary internally together with t0, t1 and t2. +// It is fine for d4 to be the same register as d0, and it often is. +// --------------------------------------------------------------------------- + +#define montreds(d4,d3,d2,d1,d0, t2,t1,t0) \ +/* Let w = d0, the original word we use as offset; d0 gets recycled */ \ +/* First let [t2;t1] = 2^32 * w */ \ +/* then let [d0;t0] = (2^64 - 2^32 + 1) * w (overwrite old d0) */ \ + lsl t1, d0, #32 __LF \ + subs t0, d0, t1 __LF \ + lsr t2, d0, #32 __LF \ + sbc d0, d0, t2 __LF \ +/* Hence [d4;..;d1] := [d3;d2;d1;0] + (2^256 - 2^224 + 2^192 + 2^96) * w */ \ + adds d1, d1, t1 __LF \ + adcs d2, d2, t2 __LF \ + adcs d3, d3, t0 __LF \ + adc d4, d0, xzr + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 + +#define c0 x6 +#define c1 x7 +#define c2 x8 +#define c3 x9 +#define c4 x10 +#define d1 x11 +#define d2 x12 +#define d3 x13 +#define d4 x14 + +#define s0 x15 +#define s1 x16 +#define s2 x17 +#define s3 x1 + +#define a0short w2 +#define a1short w3 +#define d1short w11 + +S2N_BN_SYMBOL(bignum_montsqr_p256_base): + +// Load in all words of the input + + ldp a0, a1, [x1] + ldp a2, a3, [x1, #16] + +// Square the low half, getting a result in [s3;s2;s1;s0] +// This uses 32x32->64 multiplications to reduce the number of UMULHs + + umull s0, a0short, a0short + lsr d1, a0, #32 + umull s1, d1short, d1short + umull d1, a0short, d1short + adds s0, s0, d1, lsl #33 + lsr d1, d1, #31 + adc s1, s1, d1 + umull s2, a1short, a1short + lsr d1, a1, #32 + umull s3, d1short, d1short + umull d1, a1short, d1short + mul d2, a0, a1 + umulh d3, a0, a1 + adds s2, s2, d1, lsl #33 + lsr d1, d1, #31 + adc s3, s3, d1 + adds d2, d2, d2 + adcs d3, d3, d3 + adc s3, s3, xzr + adds s1, s1, d2 + adcs s2, s2, d3 + adc s3, s3, xzr + +// Perform two "short" Montgomery steps on the low square +// This shifts it to an offset compatible with middle product + + montreds(s0,s3,s2,s1,s0, d1,d2,d3) + + montreds(s1,s0,s3,s2,s1, d1,d2,d3) + +// Compute cross-product with ADK 2x2->4 multiplier as [c3;c2;c1;c0] + + mul c0, a0, a2 + mul d4, a1, a3 + umulh c2, a0, a2 + muldiffn(d3,d2,d1, c4, a0,a1, a3,a2) + + adds c1, c0, c2 + adc c2, c2, xzr + + umulh c3, a1, a3 + + adds c1, c1, d4 + adcs c2, c2, c3 + adc c3, c3, xzr + adds c2, c2, d4 + adc c3, c3, xzr + + adds xzr, d3, #1 + adcs c1, c1, d1 + adcs c2, c2, d2 + adc c3, c3, d3 + +// Double it and add the Montgomerified low 
square + + adds c0, c0, c0 + adcs c1, c1, c1 + adcs c2, c2, c2 + adcs c3, c3, c3 + adc c4, xzr, xzr + + adds c0, c0, s2 + adcs c1, c1, s3 + adcs c2, c2, s0 + adcs c3, c3, s1 + adc c4, c4, xzr + +// Montgomery-reduce the combined low and middle term another twice + + montrede(c0,c4,c3,c2,c1,c0, d1,d2,d3) + + montrede(c1,c0,c4,c3,c2,c1, d1,d2,d3) + +// Our sum so far is in [c1,c0,c4,c3,c2]; choose more intuitive names + +#define r0 x8 +#define r1 x9 +#define r2 x10 +#define r3 x6 +#define c x7 + +// So we can have these as temps + +#define t1 x11 +#define t2 x12 +#define t3 x13 + +// Add in the pure squares 22 + 33 + + mul t1, a2, a2 + adds r0, r0, t1 + mul t2, a3, a3 + umulh t1, a2, a2 + adcs r1, r1, t1 + adcs r2, r2, t2 + umulh t2, a3, a3 + adcs r3, r3, t2 + adc c, c, xzr + +// Construct the 23 term, double and add it in + + mul t1, a2, a3 + umulh t2, a2, a3 + adds t1, t1, t1 + adcs t2, t2, t2 + adc t3, xzr, xzr + + adds r1, r1, t1 + adcs r2, r2, t2 + adcs r3, r3, t3 + adcs c, c, xzr + +// We know, writing B = 2^{4*64} that the full implicit result is +// B^2 c <= z + (B - 1) * p < B * p + (B - 1) * p < 2 * B * p, +// so the top half is certainly < 2 * p. If c = 1 already, we know +// subtracting p will give the reduced modulus. But now we do a +// subtraction-comparison to catch cases where the residue is >= p. +// The constants are such that [t3;0;t1;-1] = p_256. + +#define t0 x5 + +// Set CF (because of inversion) iff (0,p_256) <= (c,r3,r2,r1,r0) + + mov t1, #0x00000000ffffffff + subs t0, r0, #-1 + sbcs t1, r1, t1 + mov t3, #0xffffffff00000001 + sbcs t2, r2, xzr + sbcs t3, r3, t3 + sbcs xzr, c, xzr + +// Select final output accordingly + + csel r0, t0, r0, cs + csel r1, t1, r1, cs + csel r2, t2, r2, cs + csel r3, t3, r3, cs + +// Store things back in place + + stp r0, r1, [x0] + stp r2, r3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/p256_montjadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/p256_montjadd.S new file mode 100644 index 00000000000..053c8bac8d4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/p256_montjadd.S @@ -0,0 +1,612 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjadd +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). 
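+// Working with Jacobian triples lets the addition be computed entirely from field multiplications, +// squarings and additions/subtractions (montmul/montsqr/sub below); no field inversion is needed until +// a result is eventually mapped back to affine form.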
+// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjadd) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x21 +#define input_x x22 +#define input_y x23 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE +#define z_2 input_y, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define x1a sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define z2sq sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define y1a sp, #(NUMSIZE*6) + +#define NSPACE (NUMSIZE*7) + + +#define montmul_p256(P0,P1,P2) \ + add x0, P0;\ + add x1, P1;\ + add x2, P2;\ + bl .montmul_p256 + +#define montsqr_p256(P0,P1) \ + add x0, P0;\ + add x1, P1;\ + bl .montsqr_p256 + +#define sub_p256(P0,P1,P2) \ + add x0, P0;\ + add x1, P1;\ + add x2, P2;\ + bl .sub_p256 + + +// Corresponds exactly to bignum_montmul_p256 + +.montmul_p256: + ldr q20, [x2] + ldp x7, x17, [x1] + ldr q0, [x1] + ldp x6, x10, [x2] + ldp x11, x15, [x1, #16] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [x2, #16] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [x1, #16] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x2, #16] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x9, x3, x13 + adcs x3, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x12, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x15, x15, xzr + rev64 v24.4s, v20.4s + 
stp x12, x15, [x0, #16] + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + stp x9, x3, [x0] + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + ldp x15, x8, [x0] + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + ldp x9, x13, [x0, #16] + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x15 + adcs x15, x16, x8 + eor x5, x17, x4 + adcs x9, x1, x9 + eor x1, x10, x5 + adcs x16, x2, x13 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [x0] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [x0, #16] + ret + +// Corresponds exactly to bignum_montsqr_p256 + +.montsqr_p256: + ldr q19, [x1] + ldp x9, x13, [x1] + ldr q23, [x1, #16] + ldr q0, [x1] + ldp x1, x10, [x1, #16] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul 
x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x16, x3, x16, cs + csel x14, x8, x14, cs + csel x12, x11, x12, cs + csel x2, x5, x2, cs + stp x14, x12, [x0, #16] + stp x16, x2, [x0] + ret + +// Corresponds exactly to bignum_sub_p256 + +.sub_p256: + ldp x5, x6, [x1] + ldp x4, x3, [x2] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x1, #16] + ldp x4, x3, [x2, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [x0] + stp x7, x8, [x0, #16] + ret + + +S2N_BN_SYMBOL(p256_montjadd): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x30, [sp, #-16]! 
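+// (NSPACE = 7 * NUMSIZE = 224 bytes, one 32-byte slot per aliased temporary above)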
+ sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations +// 12 * multiply + 4 * square + 7 * subtract + + montsqr_p256(z1sq,z_1) + montsqr_p256(z2sq,z_2) + + montmul_p256(y1a,z_2,y_1) + montmul_p256(y2a,z_1,y_2) + + montmul_p256(x2a,z1sq,x_2) + montmul_p256(x1a,z2sq,x_1) + montmul_p256(y2a,z1sq,y2a) + montmul_p256(y1a,z2sq,y1a) + + sub_p256(xd,x2a,x1a) + sub_p256(yd,y2a,y1a) + + montsqr_p256(zz,xd) + montsqr_p256(ww,yd) + + montmul_p256(zzx1,zz,x1a) + montmul_p256(zzx2,zz,x2a) + + sub_p256(resx,ww,zzx1) + sub_p256(t1,zzx2,zzx1) + + montmul_p256(xd,xd,z_1) + + sub_p256(resx,resx,zzx2) + + sub_p256(t2,zzx1,resx) + + montmul_p256(t1,t1,y1a) + montmul_p256(resz,xd,z_2) + montmul_p256(t2,yd,t2) + + sub_p256(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) +// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + + orr x12, x0, x1 + orr x13, x2, x3 + orr x12, x12, x13 + cmp x12, xzr + cset x12, ne + + ldp x4, x5, [z_2] + ldp x6, x7, [z_2+16] + + orr x13, x4, x5 + orr x14, x6, x7 + orr x13, x13, x14 + cmp x13, xzr + cset x13, ne + + cmp x13, x12 + +// Multiplex the outputs accordingly, re-using the z's in registers + + ldp x8, x9, [resz] + csel x8, x0, x8, lo + csel x9, x1, x9, lo + csel x8, x4, x8, hi + csel x9, x5, x9, hi + ldp x10, x11, [resz+16] + csel x10, x2, x10, lo + csel x11, x3, x11, lo + csel x10, x6, x10, hi + csel x11, x7, x11, hi + + ldp x12, x13, [x_1] + ldp x0, x1, [resx] + csel x0, x12, x0, lo + csel x1, x13, x1, lo + ldp x12, x13, [x_2] + csel x0, x12, x0, hi + csel x1, x13, x1, hi + + ldp x12, x13, [x_1+16] + ldp x2, x3, [resx+16] + csel x2, x12, x2, lo + csel x3, x13, x3, lo + ldp x12, x13, [x_2+16] + csel x2, x12, x2, hi + csel x3, x13, x3, hi + + ldp x12, x13, [y_1] + ldp x4, x5, [resy] + csel x4, x12, x4, lo + csel x5, x13, x5, lo + ldp x12, x13, [y_2] + csel x4, x12, x4, hi + csel x5, x13, x5, hi + + ldp x12, x13, [y_1+16] + ldp x6, x7, [resy+16] + csel x6, x12, x6, lo + csel x7, x13, x7, lo + ldp x12, x13, [y_2+16] + csel x6, x12, x6, hi + csel x7, x13, x7, hi + +// Finally store back the multiplexed values + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [y_3] + stp x6, x7, [y_3+16] + stp x8, x9, [z_3] + stp x10, x11, [z_3+16] + +// Restore registers and return + + add sp, sp, NSPACE + ldp x23, x30, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/p256_montjdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/p256_montjdouble.S new file mode 100644 index 00000000000..befe861db25 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/p256_montjdouble.S @@ -0,0 +1,748 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjdouble +// (uint64_t p3[static 12],uint64_t p1[static 12]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjdouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjdouble) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x19 +#define input_x x20 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z2 sp, #(NUMSIZE*0) +#define y4 sp, #(NUMSIZE*0) + +#define y2 sp, #(NUMSIZE*1) + +#define t1 sp, #(NUMSIZE*2) + +#define t2 sp, #(NUMSIZE*3) +#define x2p sp, #(NUMSIZE*3) +#define dx2 sp, #(NUMSIZE*3) + +#define xy2 sp, #(NUMSIZE*4) + +#define x4p sp, #(NUMSIZE*5) +#define d_ sp, #(NUMSIZE*5) + +#define NSPACE #(NUMSIZE*6) + +// Corresponds exactly to bignum_montmul_p256 + +.montmul_p256: + ldr q20, [x2] + ldp x7, x17, [x1] + ldr q0, [x1] + ldp x6, x10, [x2] + ldp x11, x15, [x1, #16] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [x2, #16] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [x1, #16] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x2, #16] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x9, x3, x13 + adcs x3, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x12, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x15, x15, xzr + rev64 v24.4s, v20.4s + stp x12, x15, [x0, #16] + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul 
v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + stp x9, x3, [x0] + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + ldp x15, x8, [x0] + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + ldp x9, x13, [x0, #16] + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x15 + adcs x15, x16, x8 + eor x5, x17, x4 + adcs x9, x1, x9 + eor x1, x10, x5 + adcs x16, x2, x13 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [x0] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [x0, #16] + ret + +// Corresponds exactly to bignum_montsqr_p256 + +.montsqr_p256: + ldr q19, [x1] + ldp x9, x13, [x1] + ldr q23, [x1, #16] + ldr q0, [x1] + ldp x1, x10, [x1, #16] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs 
x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x16, x3, x16, cs + csel x14, x8, x14, cs + csel x12, x11, x12, cs + csel x2, x5, x2, cs + stp x14, x12, [x0, #16] + stp x16, x2, [x0] + ret + +// Corresponds exactly to bignum_sub_p256 + +.sub_p256: + ldp x5, x6, [x1] + ldp x4, x3, [x2] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x1, #16] + ldp x4, x3, [x2, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [x0] + stp x7, x8, [x0, #16] + ret + +// Corresponds exactly to bignum_add_p256 + +.add_p256: + ldp x4, x5, [x1] + ldp x8, x9, [x2] + adds x4, x4, x8 + adcs x5, x5, x9 + ldp x6, x7, [x1, #16] + ldp x10, x11, [x2, #16] + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x3, xzr, xzr + adds x8, x4, #0x1 + mov x9, #0xffffffff + sbcs x9, x5, x9 + sbcs x10, x6, xzr + mov x11, #0xffffffff00000001 + sbcs x11, x7, x11 + sbcs x3, x3, xzr + csel x4, x4, x8, cc + csel x5, x5, x9, cc + csel x6, x6, x10, cc + csel x7, x7, x11, cc + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + ret + + +#define montmul_p256(P0,P1,P2) \ + add x0, P0;\ + add x1, P1;\ + add x2, P2;\ + bl .montmul_p256 + +#define montsqr_p256(P0,P1) \ + add x0, P0;\ + add x1, P1;\ + bl .montsqr_p256 + +#define sub_p256(P0,P1,P2) \ + add x0, P0;\ + add x1, P1;\ + add x2, P2;\ + bl .sub_p256 + +#define add_p256(P0,P1,P2) \ + add x0, P0;\ + add x1, P1;\ + add x2, P2;\ + bl .add_p256 + + +// A weak version of add that only guarantees sum in 4 digits + +#define weakadd_p256(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + adds x5, x5, x4 __LF \ + adcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + adcs x7, x7, x4 __LF \ + adcs x8, x8, x3 __LF \ + csetm x3, cs __LF \ + subs x5, x5, x3 __LF \ + and x1, x3, #4294967295 __LF \ + sbcs x6, x6, x1 __LF \ + sbcs x7, x7, xzr __LF \ + and x2, x3, #-4294967295 __LF \ + sbc x8, x8, x2 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// P0 = C * P1 - D * P2 computed as D * (p_256 - P2) + 
C * P1 +// Quotient estimation is done just as q = h + 1 as in bignum_triple_p256 +// This also applies to the other functions following. + +#define cmsub_p256(P0,C,P1,D,P2) \ + mov x1, D __LF \ + mov x2, #-1 __LF \ + ldp x9, x10, [P2] __LF \ + subs x9, x2, x9 __LF \ + mov x2, #4294967295 __LF \ + sbcs x10, x2, x10 __LF \ + ldp x11, x12, [P2+16] __LF \ + sbcs x11, xzr, x11 __LF \ + mov x2, #-4294967295 __LF \ + sbc x12, x2, x12 __LF \ + mul x3, x1, x9 __LF \ + mul x4, x1, x10 __LF \ + mul x5, x1, x11 __LF \ + mul x6, x1, x12 __LF \ + umulh x9, x1, x9 __LF \ + umulh x10, x1, x10 __LF \ + umulh x11, x1, x11 __LF \ + umulh x7, x1, x12 __LF \ + adds x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + adcs x6, x6, x11 __LF \ + adc x7, x7, xzr __LF \ + mov x1, C __LF \ + ldp x9, x10, [P1] __LF \ + mul x8, x9, x1 __LF \ + umulh x9, x9, x1 __LF \ + adds x3, x3, x8 __LF \ + mul x8, x10, x1 __LF \ + umulh x10, x10, x1 __LF \ + adcs x4, x4, x8 __LF \ + ldp x11, x12, [P1+16] __LF \ + mul x8, x11, x1 __LF \ + umulh x11, x11, x1 __LF \ + adcs x5, x5, x8 __LF \ + mul x8, x12, x1 __LF \ + umulh x12, x12, x1 __LF \ + adcs x6, x6, x8 __LF \ + adc x7, x7, xzr __LF \ + adds x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + adcs x6, x6, x11 __LF \ + adc x7, x7, x12 __LF \ + add x8, x7, #1 __LF \ + lsl x10, x8, #32 __LF \ + adds x6, x6, x10 __LF \ + adc x7, x7, xzr __LF \ + neg x9, x8 __LF \ + sub x10, x10, #1 __LF \ + subs x3, x3, x9 __LF \ + sbcs x4, x4, x10 __LF \ + sbcs x5, x5, xzr __LF \ + sbcs x6, x6, x8 __LF \ + sbc x8, x7, x8 __LF \ + adds x3, x3, x8 __LF \ + and x9, x8, #4294967295 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, xzr __LF \ + neg x10, x9 __LF \ + adc x6, x6, x10 __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +// P0 = 4 * P1 - P2, by direct subtraction of P2; the method +// in bignum_cmul_p256 etc. for quotient estimation still +// works when the value to be reduced is negative, as +// long as it is > -p_256, which is the case here. The +// actual accumulation of q * p_256 is done a bit differently +// so it works for the q = 0 case. 
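+// (Concretely: writing the value to be reduced as t = 2^256 * h + l with
+// 0 <= l < 2^256 and h the signed top word, subtracting q * p_256 for
+// q = h + 1 gives t - q * p_256 = (l - 2^256) + q * (2^256 - p_256). This
+// lies in [-p_256, p_256): the upper bound because 0 < 2^256 - p_256 < 2^224
+// and q is at most a few dozen for the combinations used here, the lower
+// bound automatically when q >= 1 and from t > -p_256 when q = 0. A single
+// addition of p_256 masked by the final borrow then fully reduces.)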
+ +#define cmsub41_p256(P0,P1,P2) \ + ldp x1, x2, [P1] __LF \ + lsl x0, x1, #2 __LF \ + ldp x6, x7, [P2] __LF \ + subs x0, x0, x6 __LF \ + extr x1, x2, x1, #62 __LF \ + sbcs x1, x1, x7 __LF \ + ldp x3, x4, [P1+16] __LF \ + extr x2, x3, x2, #62 __LF \ + ldp x6, x7, [P2+16] __LF \ + sbcs x2, x2, x6 __LF \ + extr x3, x4, x3, #62 __LF \ + sbcs x3, x3, x7 __LF \ + lsr x4, x4, #62 __LF \ + sbc x4, x4, xzr __LF \ + add x5, x4, #1 __LF \ + lsl x8, x5, #32 __LF \ + subs x6, xzr, x8 __LF \ + sbcs x7, xzr, xzr __LF \ + sbc x8, x8, x5 __LF \ + adds x0, x0, x5 __LF \ + adcs x1, x1, x6 __LF \ + adcs x2, x2, x7 __LF \ + adcs x3, x3, x8 __LF \ + csetm x5, cc __LF \ + adds x0, x0, x5 __LF \ + and x6, x5, #4294967295 __LF \ + adcs x1, x1, x6 __LF \ + adcs x2, x2, xzr __LF \ + neg x7, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +// P0 = 3 * P1 - 8 * P2, computed as (p_256 - P2) << 3 + 3 * P1 + +#define cmsub38_p256(P0,P1,P2) \ + mov x1, 8 __LF \ + mov x2, #-1 __LF \ + ldp x9, x10, [P2] __LF \ + subs x9, x2, x9 __LF \ + mov x2, #4294967295 __LF \ + sbcs x10, x2, x10 __LF \ + ldp x11, x12, [P2+16] __LF \ + sbcs x11, xzr, x11 __LF \ + mov x2, #-4294967295 __LF \ + sbc x12, x2, x12 __LF \ + lsl x3, x9, #3 __LF \ + extr x4, x10, x9, #61 __LF \ + extr x5, x11, x10, #61 __LF \ + extr x6, x12, x11, #61 __LF \ + lsr x7, x12, #61 __LF \ + mov x1, 3 __LF \ + ldp x9, x10, [P1] __LF \ + mul x8, x9, x1 __LF \ + umulh x9, x9, x1 __LF \ + adds x3, x3, x8 __LF \ + mul x8, x10, x1 __LF \ + umulh x10, x10, x1 __LF \ + adcs x4, x4, x8 __LF \ + ldp x11, x12, [P1+16] __LF \ + mul x8, x11, x1 __LF \ + umulh x11, x11, x1 __LF \ + adcs x5, x5, x8 __LF \ + mul x8, x12, x1 __LF \ + umulh x12, x12, x1 __LF \ + adcs x6, x6, x8 __LF \ + adc x7, x7, xzr __LF \ + adds x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + adcs x6, x6, x11 __LF \ + adc x7, x7, x12 __LF \ + add x8, x7, #1 __LF \ + lsl x10, x8, #32 __LF \ + adds x6, x6, x10 __LF \ + adc x7, x7, xzr __LF \ + neg x9, x8 __LF \ + sub x10, x10, #1 __LF \ + subs x3, x3, x9 __LF \ + sbcs x4, x4, x10 __LF \ + sbcs x5, x5, xzr __LF \ + sbcs x6, x6, x8 __LF \ + sbc x8, x7, x8 __LF \ + adds x3, x3, x8 __LF \ + and x9, x8, #4294967295 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, xzr __LF \ + neg x10, x9 __LF \ + adc x6, x6, x10 __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +S2N_BN_SYMBOL(p256_montjdouble): + +// Save registers and make room on stack for temporary variables + + sub sp, sp, NSPACE+32 + stp x30, xzr, [sp, NSPACE+16] + stp x19, x20, [sp, NSPACE] + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + montsqr_p256(z2,z_1) + montsqr_p256(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + sub_p256(t2,x_1,z2) + weakadd_p256(t1,x_1,z2) + montmul_p256(x2p,t1,t2) + +// t1 = y + z +// xy2 = x * y^2 +// x4p = x2p^2 + + add_p256(t1,y_1,z_1) + montmul_p256(xy2,x_1,y2) + montsqr_p256(x4p,x2p) + +// t1 = (y + z)^2 + + montsqr_p256(t1,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_p256(d_,12,xy2,9,x4p) + sub_p256(t1,t1,z2) + +// y4 = y^4 + + montsqr_p256(y4,y2) + +// dx2 = d * x2p + + montmul_p256(dx2,d_,x2p) + +// z_3' = 2 * y * z + + sub_p256(z_3,t1,y2) + +// x' = 4 * xy2 - d + + cmsub41_p256(x_3,xy2,d_) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_p256(y_3,dx2,y4) + +// Restore registers and stack and return + + ldp x19, x20, [sp, NSPACE] + ldp x30, xzr, [sp, NSPACE+16] + add sp, sp, NSPACE+32 + ret 
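+
+// Note: the sequence above is the standard a = -3 Jacobian doubling. With
+// M = 3 * x2p = 3 * (x - z^2) * (x + z^2) and S = 4 * xy2 = 4 * x * y^2,
+// the quantity d = 12 * xy2 - 9 * x4p is S - x', and the stored outputs are
+// x' = M^2 - 2 * S, y' = M * (S - x') - 8 * y^4 and z' = 2 * y * z, the
+// last obtained as (y + z)^2 - y^2 - z^2.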
+ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p384/Makefile b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/Makefile similarity index 96% rename from third_party/s2n-bignum/arm/p384/Makefile rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/Makefile index 5d64426750c..0606619cf85 100644 --- a/third_party/s2n-bignum/arm/p384/Makefile +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/Makefile @@ -37,10 +37,8 @@ OBJ = bignum_add_p384.o \ bignum_montinv_p384.o \ bignum_montmul_p384.o \ bignum_montmul_p384_alt.o \ - bignum_montmul_p384_neon.o \ bignum_montsqr_p384.o \ bignum_montsqr_p384_alt.o \ - bignum_montsqr_p384_neon.o \ bignum_mux_6.o \ bignum_neg_p384.o \ bignum_nonzero_6.o \ diff --git a/third_party/s2n-bignum/arm/p384/bignum_add_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_add_p384.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/bignum_add_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_add_p384.S diff --git a/third_party/s2n-bignum/arm/p384/bignum_bigendian_6.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_bigendian_6.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/bignum_bigendian_6.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_bigendian_6.S diff --git a/third_party/s2n-bignum/arm/p384/bignum_cmul_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_cmul_p384.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/bignum_cmul_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_cmul_p384.S diff --git a/third_party/s2n-bignum/arm/p384/bignum_deamont_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_deamont_p384.S similarity index 80% rename from third_party/s2n-bignum/arm/p384/bignum_deamont_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_deamont_p384.S index 1f84a4becf9..42d595a6bb1 100644 --- a/third_party/s2n-bignum/arm/p384/bignum_deamont_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_deamont_p384.S @@ -35,27 +35,27 @@ #define montreds(d6,d5,d4,d3,d2,d1,d0, t3,t2,t1) \ /* Our correction multiplier is w = [d0 + (d0<<32)] mod 2^64 */ \ /* Recycle d0 (which we know gets implicitly cancelled) to store it */ \ - lsl t1, d0, #32; \ - add d0, t1, d0; \ + lsl t1, d0, #32 __LF \ + add d0, t1, d0 __LF \ /* Now let [t2;t1] = 2^64 * w - w + w_hi where w_hi = floor(w/2^32) */ \ /* We need to subtract 2^32 * this, and we can ignore its lower 32 */ \ /* bits since by design it will cancel anyway; we only need the w_hi */ \ /* part to get the carry propagation going. 
*/ \ - lsr t1, d0, #32; \ - subs t1, t1, d0; \ - sbc t2, d0, xzr; \ + lsr t1, d0, #32 __LF \ + subs t1, t1, d0 __LF \ + sbc t2, d0, xzr __LF \ /* Now select in t1 the field to subtract from d1 */ \ - extr t1, t2, t1, #32; \ + extr t1, t2, t1, #32 __LF \ /* And now get the terms to subtract from d2 and d3 */ \ - lsr t2, t2, #32; \ - adds t2, t2, d0; \ - adc t3, xzr, xzr; \ + lsr t2, t2, #32 __LF \ + adds t2, t2, d0 __LF \ + adc t3, xzr, xzr __LF \ /* Do the subtraction of that portion */ \ - subs d1, d1, t1; \ - sbcs d2, d2, t2; \ - sbcs d3, d3, t3; \ - sbcs d4, d4, xzr; \ - sbcs d5, d5, xzr; \ + subs d1, d1, t1 __LF \ + sbcs d2, d2, t2 __LF \ + sbcs d3, d3, t3 __LF \ + sbcs d4, d4, xzr __LF \ + sbcs d5, d5, xzr __LF \ /* Now effectively add 2^384 * w by taking d0 as the input for last sbc */ \ sbc d6, d0, xzr diff --git a/third_party/s2n-bignum/arm/p384/bignum_demont_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_demont_p384.S similarity index 77% rename from third_party/s2n-bignum/arm/p384/bignum_demont_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_demont_p384.S index 1b095172881..eca64f62dce 100644 --- a/third_party/s2n-bignum/arm/p384/bignum_demont_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_demont_p384.S @@ -35,27 +35,27 @@ #define montreds(d6,d5,d4,d3,d2,d1,d0, t3,t2,t1) \ /* Our correction multiplier is w = [d0 + (d0<<32)] mod 2^64 */ \ /* Recycle d0 (which we know gets implicitly cancelled) to store it */ \ - lsl t1, d0, #32; \ - add d0, t1, d0; \ + lsl t1, d0, #32 __LF \ + add d0, t1, d0 __LF \ /* Now let [t2;t1] = 2^64 * w - w + w_hi where w_hi = floor(w/2^32) */ \ /* We need to subtract 2^32 * this, and we can ignore its lower 32 */ \ /* bits since by design it will cancel anyway; we only need the w_hi */ \ /* part to get the carry propagation going. 
*/ \ - lsr t1, d0, #32; \ - subs t1, t1, d0; \ - sbc t2, d0, xzr; \ + lsr t1, d0, #32 __LF \ + subs t1, t1, d0 __LF \ + sbc t2, d0, xzr __LF \ /* Now select in t1 the field to subtract from d1 */ \ - extr t1, t2, t1, #32; \ + extr t1, t2, t1, #32 __LF \ /* And now get the terms to subtract from d2 and d3 */ \ - lsr t2, t2, #32; \ - adds t2, t2, d0; \ - adc t3, xzr, xzr; \ + lsr t2, t2, #32 __LF \ + adds t2, t2, d0 __LF \ + adc t3, xzr, xzr __LF \ /* Do the subtraction of that portion */ \ - subs d1, d1, t1; \ - sbcs d2, d2, t2; \ - sbcs d3, d3, t3; \ - sbcs d4, d4, xzr; \ - sbcs d5, d5, xzr; \ + subs d1, d1, t1 __LF \ + sbcs d2, d2, t2 __LF \ + sbcs d3, d3, t3 __LF \ + sbcs d4, d4, xzr __LF \ + sbcs d5, d5, xzr __LF \ /* Now effectively add 2^384 * w by taking d0 as the input for last sbc */ \ sbc d6, d0, xzr diff --git a/third_party/s2n-bignum/arm/p384/bignum_double_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_double_p384.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/bignum_double_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_double_p384.S diff --git a/third_party/s2n-bignum/arm/p384/bignum_half_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_half_p384.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/bignum_half_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_half_p384.S diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_inv_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_inv_p384.S new file mode 100644 index 00000000000..111b220a304 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_inv_p384.S @@ -0,0 +1,1469 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Modular inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1 +// Input x[6]; output z[6] +// +// extern void bignum_inv_p384(uint64_t z[static 6],uint64_t x[static 6]); +// +// If the 6-digit input x is coprime to p_384, i.e. is not divisible +// by it, returns z < p_384 such that x * z == 1 (mod p_384). Note that +// x does not need to be reduced modulo p_384, but the output always is. +// If the input is divisible (i.e. is 0 or p_384), then there can be no +// modular inverse and z = 0 is returned. 
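+// (Internally this runs repeated divstep batches, implemented by the
+// divstep59 macro below, with matching updates and Montgomery reductions
+// of the accumulators u and v.)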
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_inv_p384) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_inv_p384) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Used for the return pointer + +#define res x20 + +// Loop counter and d = 2 * delta value for divstep + +#define i x21 +#define d x22 + +// Registers used for matrix element magnitudes and signs + +#define m00 x10 +#define m01 x11 +#define m10 x12 +#define m11 x13 +#define s00 x14 +#define s01 x15 +#define s10 x16 +#define s11 x17 + +// Initial carries for combinations + +#define car0 x9 +#define car1 x19 + +// Input and output, plain registers treated according to pattern + +#define reg0 x0, #0 +#define reg1 x1, #0 +#define reg2 x2, #0 +#define reg3 x3, #0 +#define reg4 x4, #0 + +#define x x1, #0 +#define z x0, #0 + +// Pointer-offset pairs for temporaries on stack +// The u and v variables are 6 words each as expected, but the f and g +// variables are 8 words each -- they need to have at least one extra +// word for a sign word, and to preserve alignment we "round up" to 8. +// In fact, we currently keep an extra word in u and v as well. + +#define f sp, #0 +#define g sp, #(8*N) +#define u sp, #(16*N) +#define v sp, #(24*N) + +// Total size to reserve on the stack + +#define NSPACE #(32*N) + +// --------------------------------------------------------------------------- +// Core signed almost-Montgomery reduction macro. Takes input in +// [d6;d5;d4;d3;d2;d1;d0] and returns result in [d6;d5d4;d3;d2;d1], adding +// to the existing [d6;d5;d4;d3;d2;d1], and re-using d0 as a temporary +// internally as well as t0, t1, t2. This is almost-Montgomery, i.e. the +// result fits in 6 digits but is not necessarily strictly reduced mod p_384. +// --------------------------------------------------------------------------- + +#define amontred(d6,d5,d4,d3,d2,d1,d0, t3,t2,t1) \ +/* We only know the input is -2^444 < x < 2^444. To do traditional */ \ +/* unsigned Montgomery reduction, start by adding 2^61 * p_384. */ \ + mov t1, #0xe000000000000000 __LF \ + adds d0, d0, t1 __LF \ + mov t2, #0x000000001fffffff __LF \ + adcs d1, d1, t2 __LF \ + mov t3, #0xffffffffe0000000 __LF \ + bic t3, t3, #0x2000000000000000 __LF \ + adcs d2, d2, t3 __LF \ + sbcs d3, d3, xzr __LF \ + sbcs d4, d4, xzr __LF \ + sbcs d5, d5, xzr __LF \ + mov t1, #0x1fffffffffffffff __LF \ + adc d6, d6, t1 __LF \ +/* Our correction multiplier is w = [d0 + (d0<<32)] mod 2^64 */ \ +/* Store it back into d0 since we no longer need that digit. 
*/ \ + add d0, d0, d0, lsl #32 __LF \ +/* Now let [t3;t2;t1;-] = (2^384 - p_384) * w */ \ +/* We know the lowest word will cancel d0 so we don't need it */ \ + mov t1, #0xffffffff00000001 __LF \ + umulh t1, t1, d0 __LF \ + mov t2, #0x00000000ffffffff __LF \ + mul t3, t2, d0 __LF \ + umulh t2, t2, d0 __LF \ + adds t1, t1, t3 __LF \ + adcs t2, t2, d0 __LF \ + cset t3, cs __LF \ +/* Now x + p_384 * w = (x + 2^384 * w) - (2^384 - p_384) * w */ \ +/* We catch the net top carry from add-subtract in the digit d0 */ \ + adds d6, d6, d0 __LF \ + cset d0, cs __LF \ + subs d1, d1, t1 __LF \ + sbcs d2, d2, t2 __LF \ + sbcs d3, d3, t3 __LF \ + sbcs d4, d4, xzr __LF \ + sbcs d5, d5, xzr __LF \ + sbcs d6, d6, xzr __LF \ + sbcs d0, d0, xzr __LF \ +/* Now if d0 is nonzero we subtract p_384 (almost-Montgomery) */ \ + neg d0, d0 __LF \ + and t1, d0, #0x00000000ffffffff __LF \ + and t2, d0, #0xffffffff00000000 __LF \ + and t3, d0, #0xfffffffffffffffe __LF \ + subs d1, d1, t1 __LF \ + sbcs d2, d2, t2 __LF \ + sbcs d3, d3, t3 __LF \ + sbcs d4, d4, d0 __LF \ + sbcs d5, d5, d0 __LF \ + sbc d6, d6, d0 + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. +// But different in register usage and returning the final matrix in +// registers as follows +// +// [ m00 m01] +// [ m10 m11] + +#define divstep59() \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add 
x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x8, x4, #0x100, lsl #12 __LF \ + sbfx x8, x8, #21, #21 __LF \ + mov x11, #0x100000 __LF \ + add x11, x11, x11, lsl #21 __LF \ + add x9, x4, x11 __LF \ + asr x9, x9, #42 __LF \ + add x10, x5, #0x100, lsl #12 __LF \ + sbfx x10, x10, #21, #21 __LF \ + add x11, x5, x11 __LF \ + asr x11, x11, #42 __LF \ + mul x6, x8, x2 __LF \ + mul x7, x9, x3 __LF \ + mul x2, x10, x2 __LF \ + mul x3, x11, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ 
+ csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ 
+ csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #21, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #42 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, x14, #21, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #42 __LF \ + mul x6, x12, x2 __LF \ + mul x7, x13, x3 __LF \ + mul x2, x14, x2 __LF \ + mul x3, x15, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne 
__LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + mul x2, x12, x8 __LF \ + mul x3, x12, x9 __LF \ + mul x6, x14, x8 __LF \ + mul x7, x14, x9 __LF \ + madd x8, x13, x10, x2 __LF \ + madd x9, x13, x11, x3 __LF \ + madd x16, x15, x10, x6 __LF \ + madd x17, x15, x11, x7 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #22, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #43 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, x14, #22, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #43 __LF \ + mneg x2, x12, x8 __LF \ + mneg x3, x12, x9 __LF \ + mneg x4, x14, x8 __LF \ + mneg x5, x14, x9 __LF \ + msub m00, x13, x16, x2 __LF \ + msub m01, x13, x17, x3 __LF \ + msub m10, x15, x16, x4 __LF \ + msub m11, x15, x17, x5 + +S2N_BN_SYMBOL(bignum_inv_p384): + +// Save registers and make room for temporaries + + stp x19, x20, [sp, -16]! + stp x21, x22, [sp, -16]! + stp x23, x24, [sp, -16]! 
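+// (NSPACE = 32 * N = 256 bytes: an 8-word slot for each of f, g, u and v)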
+ sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Copy the prime and input into the main f and g variables respectively. +// Make sure x is reduced so that g <= f as assumed in the bound proof. + + mov x10, #0x00000000ffffffff + mov x11, #0xffffffff00000000 + mov x12, #0xfffffffffffffffe + mov x15, #0xffffffffffffffff + stp x10, x11, [f] + stp x12, x15, [f+2*N] + stp x15, x15, [f+4*N] + str xzr, [f+6*N] + + ldp x2, x3, [x1] + subs x10, x2, x10 + sbcs x11, x3, x11 + ldp x4, x5, [x1, #(2*N)] + sbcs x12, x4, x12 + sbcs x13, x5, x15 + ldp x6, x7, [x1, #(4*N)] + sbcs x14, x6, x15 + sbcs x15, x7, x15 + + csel x2, x2, x10, cc + csel x3, x3, x11, cc + csel x4, x4, x12, cc + csel x5, x5, x13, cc + csel x6, x6, x14, cc + csel x7, x7, x15, cc + + stp x2, x3, [g] + stp x4, x5, [g+2*N] + stp x6, x7, [g+4*N] + str xzr, [g+6*N] + +// Also maintain reduced < 2^384 vector [u,v] such that +// [f,g] == x * 2^{5*i-75} * [u,v] (mod p_384) +// starting with [p_384,x] == x * 2^{5*0-75} * [0,2^75] (mod p_384) +// The weird-looking 5*i modifications come in because we are doing +// 64-bit word-sized Montgomery reductions at each stage, which is +// 5 bits more than the 59-bit requirement to keep things stable. + + stp xzr, xzr, [u] + stp xzr, xzr, [u+2*N] + stp xzr, xzr, [u+4*N] + + mov x10, #2048 + stp xzr, x10, [v] + stp xzr, xzr, [v+2*N] + stp xzr, xzr, [v+4*N] + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special fifteenth iteration after a uniform +// first 14. + + mov i, #15 + mov d, #1 + b bignum_inv_p384_midloop + +bignum_inv_p384_loop: + +// Separate the matrix elements into sign-magnitude pairs + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in stable registers for the [u,v] part and do [f,g] first. + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + + and x0, m10, s10 + and x1, m11, s11 + add car1, x0, x1 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. 
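+// (For each word w of f or g, the signed product with a matrix entry of
+// magnitude m and sign mask s (0 or all 1s) is formed as (w EOR s) * m,
+// with a one-off +(m AND s) folded into car0/car1 at the lowest digit,
+// using the identity -(w * m) = (NOT w) * m + m carried across the words.)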
+// +// Digit 0 of [f,g] + + ldr x7, [f] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [g] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x3, x3, x1 + +// Digit 1 of [f,g] + + ldr x7, [f+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [g+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [f] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [g] + +// Digit 2 of [f,g] + + ldr x7, [f+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [g+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [f+N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [g+N] + +// Digit 3 of [f,g] + + ldr x7, [f+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [g+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [f+2*N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x6, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [g+2*N] + +// Digit 4 of [f,g] + + ldr x7, [f+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [g+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [f+3*N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x6, x6, x0 + adc x5, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [g+3*N] + +// Digits 5 and 6 of [f,g] + + ldr x7, [f+5*N] + eor x1, x7, s00 + ldr x23, [f+6*N] + eor x2, x23, s00 + and x2, x2, m00 + neg x2, x2 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, x2, x1 + ldr x8, [g+5*N] + eor x1, x8, s01 + ldr x24, [g+6*N] + eor x0, x24, s01 + and x0, x0, m01 + sub x2, x2, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [f+4*N] + extr x4, x2, x4, #59 + str x4, [f+5*N] + asr x2, x2, #59 + str x2, [f+6*N] + + eor x1, x7, s10 + eor x4, x23, s10 + and x4, x4, m10 + neg x4, x4 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, x5, x0 + adc x4, x4, x1 + eor x1, x8, s11 + eor x0, x24, s11 + and x0, x0, m11 + sub x4, x4, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x4, x4, x1 + extr x6, x5, x6, #59 + str x6, [g+4*N] + extr x5, x4, x5, #59 + str x5, [g+5*N] + asr x4, x4, #59 + str x4, [g+6*N] + +// Now the computation of the updated u and v values and their +// Montgomery reductions. 
A very similar accumulation except that +// the top words of u and v are unsigned and we don't shift. +// +// Digit 0 of [u,v] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v] + adc x3, x3, x1 + +// Digit 1 of [u,v] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + str x3, [v+N] + adc x4, x4, x1 + +// Digit 2 of [u,v] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + str x4, [v+2*N] + adc x2, x2, x1 + +// Digit 3 of [u,v] + + ldr x7, [u+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + str x5, [u+3*N] + adc x3, x3, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x6, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + str x2, [v+3*N] + adc x6, x6, x1 + +// Digit 4 of [u,v] + + ldr x7, [u+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [v+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + str x3, [u+4*N] + adc x4, x4, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x6, x6, x0 + adc x5, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x6, x6, x0 + str x6, [v+4*N] + adc x5, x5, x1 + +// Digits 5 and 6 of [u,v] (top is unsigned) + + ldr x7, [u+5*N] + eor x1, x7, s00 + and x2, s00, m00 + neg x2, x2 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, x2, x1 + ldr x8, [v+5*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x2, x2, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u+5*N] + adc x2, x2, x1 + str x2, [u+6*N] + + eor x1, x7, s10 + and x4, s10, m10 + neg x4, x4 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, x5, x0 + adc x4, x4, x1 + eor x1, x8, s11 + and x0, s11, m11 + sub x4, x4, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v+5*N] + adc x4, x4, x1 + str x4, [v+6*N] + +// Montgomery reduction of u + + ldp x0, x1, [u] + ldp x2, x3, [u+16] + ldp x4, x5, [u+32] + ldr x6, [u+48] + amontred(x6,x5,x4,x3,x2,x1,x0, x9,x8,x7) + stp x1, x2, [u] + stp x3, x4, [u+16] + stp x5, x6, [u+32] + +// Montgomery reduction of v + + ldp x0, x1, [v] + ldp x2, x3, [v+16] + ldp x4, x5, [v+32] + ldr x6, [v+48] + amontred(x6,x5,x4,x3,x2,x1,x0, x9,x8,x7) + stp x1, x2, [v] + stp x3, x4, [v+16] + stp x5, x6, [v+32] + +bignum_inv_p384_midloop: + + mov x1, d + ldr x2, [f] + 
ldr x3, [g] + divstep59() + mov d, x1 + +// Next iteration + + subs i, i, #1 + bne bignum_inv_p384_loop + +// The 15th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + ldr x0, [f] + ldr x1, [g] + mul x0, x0, m00 + madd x1, x1, m01, x0 + asr x0, x1, #63 + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * [u,v] (mod p_384) +// we want to flip the sign of u according to that of f. + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + eor s00, s00, x0 + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + eor s01, s01, x0 + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + eor s10, s10, x0 + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + eor s11, s11, x0 + +// Adjust the initial value to allow for complement instead of negation + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + +// Digit 0 of [u] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + +// Digit 1 of [u] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + +// Digit 2 of [u] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + +// Digit 3 of [u] + + ldr x7, [u+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + str x5, [u+3*N] + adc x3, x3, x1 + +// Digit 4 of [u] + + ldr x7, [u+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [v+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + str x3, [u+4*N] + adc x4, x4, x1 + +// Digits 5 and 6 of [u] (top is unsigned) + + ldr x7, [u+5*N] + eor x1, x7, s00 + and x2, s00, m00 + neg x2, x2 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, x2, x1 + ldr x8, [v+5*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x2, x2, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u+5*N] + adc x2, x2, x1 + str x2, [u+6*N] + +// Montgomery reduction of u. 
This needs to be strict not "almost" +// so it is followed by an optional subtraction of p_384 + + ldp x10, x0, [u] + ldp x1, x2, [u+16] + ldp x3, x4, [u+32] + ldr x5, [u+48] + amontred(x5,x4,x3,x2,x1,x0,x10, x9,x8,x7) + + mov x10, #0x00000000ffffffff + subs x10, x0, x10 + mov x11, #0xffffffff00000000 + sbcs x11, x1, x11 + mov x12, #0xfffffffffffffffe + sbcs x12, x2, x12 + mov x15, #0xffffffffffffffff + sbcs x13, x3, x15 + sbcs x14, x4, x15 + sbcs x15, x5, x15 + + csel x0, x0, x10, cc + csel x1, x1, x11, cc + csel x2, x2, x12, cc + csel x3, x3, x13, cc + csel x4, x4, x14, cc + csel x5, x5, x15, cc + +// Store it back to the final output + + stp x0, x1, [res] + stp x2, x3, [res, #16] + stp x4, x5, [res, #32] + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p384/bignum_littleendian_6.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_littleendian_6.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/bignum_littleendian_6.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_littleendian_6.S diff --git a/third_party/s2n-bignum/arm/p384/bignum_mod_n384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_mod_n384.S similarity index 87% rename from third_party/s2n-bignum/arm/p384/bignum_mod_n384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_mod_n384.S index a91bb2c5b5a..9aaa029e232 100644 --- a/third_party/s2n-bignum/arm/p384/bignum_mod_n384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_mod_n384.S @@ -59,9 +59,9 @@ // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 S2N_BN_SYMBOL(bignum_mod_n384): @@ -71,7 +71,7 @@ S2N_BN_SYMBOL(bignum_mod_n384_alt): // If the input is already <= 5 words long, go to a trivial "copy" path cmp k, #6 - bcc short + bcc bignum_mod_n384_short // Otherwise load the top 6 digits (top-down) and reduce k by 6 @@ -105,8 +105,8 @@ S2N_BN_SYMBOL(bignum_mod_n384_alt): // Now do (k-6) iterations of 7->6 word modular reduction - cbz k, writeback -loop: + cbz k, bignum_mod_n384_writeback +bignum_mod_n384_loop: // Compute q = min (m5 + 1) (2^64 - 1) @@ -161,11 +161,11 @@ loop: sbc m5, m4, xzr mov m4, t - cbnz k, loop + cbnz k, bignum_mod_n384_loop // Finally write back [m5;m4;m3;m2;m1;m0] and return -writeback: +bignum_mod_n384_writeback: stp m0, m1, [z] stp m2, m3, [z, #16] stp m4, m5, [z, #32] @@ -174,7 +174,7 @@ writeback: // Short case: just copy the input with zero-padding -short: +bignum_mod_n384_short: mov m0, xzr mov m1, xzr mov m2, xzr @@ -182,21 +182,21 @@ short: mov m4, xzr mov m5, xzr - cbz k, writeback + cbz k, bignum_mod_n384_writeback ldr m0, [x] subs k, k, #1 - beq writeback + beq bignum_mod_n384_writeback ldr m1, [x, #8] subs k, k, #1 - beq writeback + beq bignum_mod_n384_writeback ldr m2, [x, #16] subs k, k, #1 - beq writeback + beq bignum_mod_n384_writeback ldr m3, [x, #24] subs k, k, #1 - beq writeback + beq bignum_mod_n384_writeback ldr m4, [x, #32] - b writeback + b bignum_mod_n384_writeback #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits diff --git a/third_party/s2n-bignum/arm/p384/bignum_mod_n384_6.S 
b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_mod_n384_6.S similarity index 91% rename from third_party/s2n-bignum/arm/p384/bignum_mod_n384_6.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_mod_n384_6.S index e79ad3fe853..ad9e4b9700e 100644 --- a/third_party/s2n-bignum/arm/p384/bignum_mod_n384_6.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_mod_n384_6.S @@ -37,9 +37,9 @@ #define d5 x13 #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 S2N_BN_SYMBOL(bignum_mod_n384_6): diff --git a/third_party/s2n-bignum/arm/p384/bignum_mod_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_mod_p384.S similarity index 90% rename from third_party/s2n-bignum/arm/p384/bignum_mod_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_mod_p384.S index cf7f1d6bbbf..a92548684d3 100644 --- a/third_party/s2n-bignum/arm/p384/bignum_mod_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_mod_p384.S @@ -49,7 +49,7 @@ S2N_BN_SYMBOL(bignum_mod_p384_alt): // If the input is already <= 5 words long, go to a trivial "copy" path cmp k, #6 - bcc short + bcc bignum_mod_p384_short // Otherwise load the top 6 digits (top-down) and reduce k by 6 @@ -83,8 +83,8 @@ S2N_BN_SYMBOL(bignum_mod_p384_alt): // Now do (k-6) iterations of 7->6 word modular reduction - cbz k, writeback -loop: + cbz k, bignum_mod_p384_writeback +bignum_mod_p384_loop: // Decrement k and load the next digit as t5. We now want to reduce // [m5;m4;m3;m2;m1;m0;t5] |-> [m5;m4;m3;m2;m1;m0]; the shuffling downwards is @@ -134,11 +134,11 @@ loop: adcs m4, t4, n1 adc m5, t5, n1 - cbnz k, loop + cbnz k, bignum_mod_p384_loop // Finally write back [m5;m4;m3;m2;m1;m0] and return -writeback: +bignum_mod_p384_writeback: stp m0, m1, [z] stp m2, m3, [z, #16] stp m4, m5, [z, #32] @@ -147,7 +147,7 @@ writeback: // Short case: just copy the input with zero-padding -short: +bignum_mod_p384_short: mov m0, xzr mov m1, xzr mov m2, xzr @@ -155,21 +155,21 @@ short: mov m4, xzr mov m5, xzr - cbz k, writeback + cbz k, bignum_mod_p384_writeback ldr m0, [x] subs k, k, #1 - beq writeback + beq bignum_mod_p384_writeback ldr m1, [x, #8] subs k, k, #1 - beq writeback + beq bignum_mod_p384_writeback ldr m2, [x, #16] subs k, k, #1 - beq writeback + beq bignum_mod_p384_writeback ldr m3, [x, #24] subs k, k, #1 - beq writeback + beq bignum_mod_p384_writeback ldr m4, [x, #32] - b writeback + b bignum_mod_p384_writeback #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits diff --git a/third_party/s2n-bignum/arm/p384/bignum_mod_p384_6.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_mod_p384_6.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/bignum_mod_p384_6.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_mod_p384_6.S diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montinv_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montinv_p384.S new file mode 100644 index 00000000000..fd572e9677b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montinv_p384.S @@ -0,0 +1,1487 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1 +// Input x[6]; output z[6] +// +// extern void bignum_montinv_p384(uint64_t z[static 6],uint64_t x[static 6]); +// +// If the 6-digit input x is coprime to p_384, i.e. is not divisible +// by it, returns z < p_384 such that x * z == 2^768 (mod p_384). This +// is effectively "Montgomery inverse" because if we consider x and z as +// Montgomery forms of X and Z, i.e. x == 2^384 * X and z == 2^384 * Z +// (both mod p_384) then X * Z == 1 (mod p_384). That is, this function +// gives the analog of the modular inverse bignum_inv_p384 but with both +// input and output in the Montgomery domain. Note that x does not need +// to be reduced modulo p_384, but the output always is. If the input +// is divisible (i.e. is 0 or p_384), then there can be no solution to +// the congruence x * z == 2^768 (mod p_384), and z = 0 is returned. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montinv_p384) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montinv_p384) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Used for the return pointer + +#define res x20 + +// Loop counter and d = 2 * delta value for divstep + +#define i x21 +#define d x22 + +// Registers used for matrix element magnitudes and signs + +#define m00 x10 +#define m01 x11 +#define m10 x12 +#define m11 x13 +#define s00 x14 +#define s01 x15 +#define s10 x16 +#define s11 x17 + +// Initial carries for combinations + +#define car0 x9 +#define car1 x19 + +// Input and output, plain registers treated according to pattern + +#define reg0 x0, #0 +#define reg1 x1, #0 +#define reg2 x2, #0 +#define reg3 x3, #0 +#define reg4 x4, #0 + +#define x x1, #0 +#define z x0, #0 + +// Pointer-offset pairs for temporaries on stack +// The u and v variables are 6 words each as expected, but the f and g +// variables are 8 words each -- they need to have at least one extra +// word for a sign word, and to preserve alignment we "round up" to 8. +// In fact, we currently keep an extra word in u and v as well. + +#define f sp, #0 +#define g sp, #(8*N) +#define u sp, #(16*N) +#define v sp, #(24*N) + +// Total size to reserve on the stack + +#define NSPACE #(32*N) + +// --------------------------------------------------------------------------- +// Core signed almost-Montgomery reduction macro. Takes input in +// [d6;d5;d4;d3;d2;d1;d0] and returns result in [d6;d5;d4;d3;d2;d1], adding +// to the existing [d6;d5;d4;d3;d2;d1], and re-using d0 as a temporary +// internally as well as t1, t2, t3. This is almost-Montgomery, i.e. the +// result fits in 6 digits but is not necessarily strictly reduced mod p_384. +// --------------------------------------------------------------------------- + +#define amontred(d6,d5,d4,d3,d2,d1,d0, t3,t2,t1) \ +/* We only know the input is -2^444 < x < 2^444. To do traditional */ \ +/* unsigned Montgomery reduction, start by adding 2^61 * p_384.
*/ \ + mov t1, #0xe000000000000000 __LF \ + adds d0, d0, t1 __LF \ + mov t2, #0x000000001fffffff __LF \ + adcs d1, d1, t2 __LF \ + mov t3, #0xffffffffe0000000 __LF \ + bic t3, t3, #0x2000000000000000 __LF \ + adcs d2, d2, t3 __LF \ + sbcs d3, d3, xzr __LF \ + sbcs d4, d4, xzr __LF \ + sbcs d5, d5, xzr __LF \ + mov t1, #0x1fffffffffffffff __LF \ + adc d6, d6, t1 __LF \ +/* Our correction multiplier is w = [d0 + (d0<<32)] mod 2^64 */ \ +/* Store it back into d0 since we no longer need that digit. */ \ + add d0, d0, d0, lsl #32 __LF \ +/* Now let [t3;t2;t1;-] = (2^384 - p_384) * w */ \ +/* We know the lowest word will cancel d0 so we don't need it */ \ + mov t1, #0xffffffff00000001 __LF \ + umulh t1, t1, d0 __LF \ + mov t2, #0x00000000ffffffff __LF \ + mul t3, t2, d0 __LF \ + umulh t2, t2, d0 __LF \ + adds t1, t1, t3 __LF \ + adcs t2, t2, d0 __LF \ + cset t3, cs __LF \ +/* Now x + p_384 * w = (x + 2^384 * w) - (2^384 - p_384) * w */ \ +/* We catch the net top carry from add-subtract in the digit d0 */ \ + adds d6, d6, d0 __LF \ + cset d0, cs __LF \ + subs d1, d1, t1 __LF \ + sbcs d2, d2, t2 __LF \ + sbcs d3, d3, t3 __LF \ + sbcs d4, d4, xzr __LF \ + sbcs d5, d5, xzr __LF \ + sbcs d6, d6, xzr __LF \ + sbcs d0, d0, xzr __LF \ +/* Now if d0 is nonzero we subtract p_384 (almost-Montgomery) */ \ + neg d0, d0 __LF \ + and t1, d0, #0x00000000ffffffff __LF \ + and t2, d0, #0xffffffff00000000 __LF \ + and t3, d0, #0xfffffffffffffffe __LF \ + subs d1, d1, t1 __LF \ + sbcs d2, d2, t2 __LF \ + sbcs d3, d3, t3 __LF \ + sbcs d4, d4, d0 __LF \ + sbcs d5, d5, d0 __LF \ + sbc d6, d6, d0 + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. +// But different in register usage and returning the final matrix in +// registers as follows +// +// [ m00 m01] +// [ m10 m11] + +#define divstep59() \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add 
x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x8, x4, #0x100, lsl #12 __LF \ + sbfx x8, x8, #21, #21 __LF \ + mov x11, #0x100000 __LF \ + add x11, x11, x11, lsl #21 __LF \ + add x9, x4, x11 __LF \ + asr x9, x9, #42 __LF \ + add x10, x5, #0x100, lsl #12 __LF \ + sbfx x10, x10, #21, #21 __LF \ + add x11, x5, x11 __LF \ + asr x11, x11, #42 __LF \ + mul x6, x8, x2 __LF \ + mul x7, x9, x3 __LF \ + mul x2, x10, x2 __LF \ + mul x3, x11, x3 __LF \ + 
add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ 
+ asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #21, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #42 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, x14, #21, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #42 __LF \ + mul x6, x12, x2 __LF \ + mul x7, x13, x3 __LF \ + mul x2, x14, x2 __LF \ + mul x3, x15, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + 
ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + mul x2, x12, x8 __LF \ + mul x3, x12, x9 __LF \ + mul x6, x14, x8 __LF \ + mul x7, x14, x9 __LF \ + madd x8, x13, x10, x2 __LF \ + madd x9, x13, x11, x3 __LF \ + madd x16, x15, x10, x6 __LF \ + madd x17, x15, x11, x7 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #22, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #43 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, x14, #22, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, 
#43 __LF \ + mneg x2, x12, x8 __LF \ + mneg x3, x12, x9 __LF \ + mneg x4, x14, x8 __LF \ + mneg x5, x14, x9 __LF \ + msub m00, x13, x16, x2 __LF \ + msub m01, x13, x17, x3 __LF \ + msub m10, x15, x16, x4 __LF \ + msub m11, x15, x17, x5 + +S2N_BN_SYMBOL(bignum_montinv_p384): + +// Save registers and make room for temporaries + + stp x19, x20, [sp, -16]! + stp x21, x22, [sp, -16]! + stp x23, x24, [sp, -16]! + sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Copy the prime and input into the main f and g variables respectively. +// Make sure x is reduced so that g <= f as assumed in the bound proof. + + mov x10, #0x00000000ffffffff + mov x11, #0xffffffff00000000 + mov x12, #0xfffffffffffffffe + mov x15, #0xffffffffffffffff + stp x10, x11, [f] + stp x12, x15, [f+2*N] + stp x15, x15, [f+4*N] + str xzr, [f+6*N] + + ldp x2, x3, [x1] + subs x10, x2, x10 + sbcs x11, x3, x11 + ldp x4, x5, [x1, #(2*N)] + sbcs x12, x4, x12 + sbcs x13, x5, x15 + ldp x6, x7, [x1, #(4*N)] + sbcs x14, x6, x15 + sbcs x15, x7, x15 + + csel x2, x2, x10, cc + csel x3, x3, x11, cc + csel x4, x4, x12, cc + csel x5, x5, x13, cc + csel x6, x6, x14, cc + csel x7, x7, x15, cc + + stp x2, x3, [g] + stp x4, x5, [g+2*N] + stp x6, x7, [g+4*N] + str xzr, [g+6*N] + +// Also maintain reduced < 2^384 vector [u,v] such that +// [f,g] == x * 2^{5*i-843} * [u,v] (mod p_384) +// starting with [p_384,x] == x * 2^{5*0-843} * [0,2^843] (mod p_384) +// The weird-looking 5*i modifications come in because we are doing +// 64-bit word-sized Montgomery reductions at each stage, which is +// 5 bits more than the 59-bit requirement to keep things stable. +// After the 15th and last iteration and sign adjustment, when +// f == 1 for in-scope cases, we have x * 2^{75-843} * u == 1, i.e. +// x * u == 2^768 as required. + + stp xzr, xzr, [u] + stp xzr, xzr, [u+2*N] + stp xzr, xzr, [u+4*N] + +// The starting constant 2^843 mod p_384 is +// 0x0000000000000800:00001000000007ff:fffff00000000000 +// :00001000000007ff:fffff00000000800:0000000000000000 +// where colons separate 64-bit subwords, least significant at the right. +// Not all of these are single loads on ARM so this is a bit dynamic + + mov x12, #0xfffff00000000000 + orr x10, x12, #0x0000000000000800 + stp xzr, x10, [v] + mov x11, #0x00000000000007ff + orr x11, x11, #0x0000100000000000 + stp x11, x12, [v+2*N] + mov x12, #0x0000000000000800 + stp x11, x12, [v+4*N] + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special fifteenth iteration after a uniform +// first 14. + + mov i, #15 + mov d, #1 + b bignum_montinv_p384_midloop + +bignum_montinv_p384_loop: + +// Separate the matrix elements into sign-magnitude pairs + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in stable registers for the [u,v] part and do [f,g] first. + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + + and x0, m10, s10 + and x1, m11, s11 + add car1, x0, x1 + +// Now the computation of the updated f and g values. 
This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. +// +// Digit 0 of [f,g] + + ldr x7, [f] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [g] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x3, x3, x1 + +// Digit 1 of [f,g] + + ldr x7, [f+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [g+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [f] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [g] + +// Digit 2 of [f,g] + + ldr x7, [f+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [g+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [f+N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [g+N] + +// Digit 3 of [f,g] + + ldr x7, [f+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [g+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [f+2*N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x6, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [g+2*N] + +// Digit 4 of [f,g] + + ldr x7, [f+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [g+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [f+3*N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x6, x6, x0 + adc x5, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [g+3*N] + +// Digits 5 and 6 of [f,g] + + ldr x7, [f+5*N] + eor x1, x7, s00 + ldr x23, [f+6*N] + eor x2, x23, s00 + and x2, x2, m00 + neg x2, x2 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, x2, x1 + ldr x8, [g+5*N] + eor x1, x8, s01 + ldr x24, [g+6*N] + eor x0, x24, s01 + and x0, x0, m01 + sub x2, x2, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [f+4*N] + extr x4, x2, x4, #59 + str x4, [f+5*N] + asr x2, x2, #59 + str x2, [f+6*N] + + eor x1, x7, s10 + eor x4, x23, s10 + and x4, x4, m10 + neg x4, x4 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, x5, x0 + adc x4, x4, x1 + eor x1, x8, s11 + eor x0, x24, s11 + and x0, x0, m11 + sub x4, x4, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x4, x4, x1 + extr x6, x5, x6, #59 + str x6, [g+4*N] + extr x5, x4, x5, #59 + str x5, [g+5*N] + asr x4, x4, #59 + str 
x4, [g+6*N] + +// Now the computation of the updated u and v values and their +// Montgomery reductions. A very similar accumulation except that +// the top words of u and v are unsigned and we don't shift. +// +// Digit 0 of [u,v] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v] + adc x3, x3, x1 + +// Digit 1 of [u,v] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + str x3, [v+N] + adc x4, x4, x1 + +// Digit 2 of [u,v] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + str x4, [v+2*N] + adc x2, x2, x1 + +// Digit 3 of [u,v] + + ldr x7, [u+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + str x5, [u+3*N] + adc x3, x3, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x6, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + str x2, [v+3*N] + adc x6, x6, x1 + +// Digit 4 of [u,v] + + ldr x7, [u+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [v+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + str x3, [u+4*N] + adc x4, x4, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x6, x6, x0 + adc x5, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x6, x6, x0 + str x6, [v+4*N] + adc x5, x5, x1 + +// Digits 5 and 6 of [u,v] (top is unsigned) + + ldr x7, [u+5*N] + eor x1, x7, s00 + and x2, s00, m00 + neg x2, x2 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, x2, x1 + ldr x8, [v+5*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x2, x2, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u+5*N] + adc x2, x2, x1 + str x2, [u+6*N] + + eor x1, x7, s10 + and x4, s10, m10 + neg x4, x4 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, x5, x0 + adc x4, x4, x1 + eor x1, x8, s11 + and x0, s11, m11 + sub x4, x4, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v+5*N] + adc x4, x4, x1 + str x4, [v+6*N] + +// Montgomery reduction of u + + ldp x0, x1, [u] + ldp x2, x3, [u+16] + ldp x4, x5, [u+32] + ldr x6, [u+48] + amontred(x6,x5,x4,x3,x2,x1,x0, x9,x8,x7) + stp x1, x2, [u] + stp x3, x4, [u+16] + stp x5, x6, [u+32] + +// Montgomery reduction of v + + ldp x0, x1, [v] + ldp x2, x3, [v+16] + ldp x4, x5, [v+32] + ldr x6, [v+48] + amontred(x6,x5,x4,x3,x2,x1,x0, x9,x8,x7) + stp x1, 
x2, [v] + stp x3, x4, [v+16] + stp x5, x6, [v+32] + +bignum_montinv_p384_midloop: + + mov x1, d + ldr x2, [f] + ldr x3, [g] + divstep59() + mov d, x1 + +// Next iteration + + subs i, i, #1 + bne bignum_montinv_p384_loop + +// The 15th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + ldr x0, [f] + ldr x1, [g] + mul x0, x0, m00 + madd x1, x1, m01, x0 + asr x0, x1, #63 + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * 2^{-768} [u,v] (mod p_384) +// we want to flip the sign of u according to that of f. + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + eor s00, s00, x0 + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + eor s01, s01, x0 + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + eor s10, s10, x0 + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + eor s11, s11, x0 + +// Adjust the initial value to allow for complement instead of negation + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + +// Digit 0 of [u] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + +// Digit 1 of [u] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + +// Digit 2 of [u] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + +// Digit 3 of [u] + + ldr x7, [u+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + str x5, [u+3*N] + adc x3, x3, x1 + +// Digit 4 of [u] + + ldr x7, [u+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [v+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + str x3, [u+4*N] + adc x4, x4, x1 + +// Digits 5 and 6 of [u] (top is unsigned) + + ldr x7, [u+5*N] + eor x1, x7, s00 + and x2, s00, m00 + neg x2, x2 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, x2, x1 + ldr x8, [v+5*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x2, x2, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u+5*N] + adc x2, x2, x1 + str x2, [u+6*N] + +// Montgomery reduction of u. 
This needs to be strict not "almost" +// so it is followed by an optional subtraction of p_384 + + ldp x10, x0, [u] + ldp x1, x2, [u+16] + ldp x3, x4, [u+32] + ldr x5, [u+48] + amontred(x5,x4,x3,x2,x1,x0,x10, x9,x8,x7) + + mov x10, #0x00000000ffffffff + subs x10, x0, x10 + mov x11, #0xffffffff00000000 + sbcs x11, x1, x11 + mov x12, #0xfffffffffffffffe + sbcs x12, x2, x12 + mov x15, #0xffffffffffffffff + sbcs x13, x3, x15 + sbcs x14, x4, x15 + sbcs x15, x5, x15 + + csel x0, x0, x10, cc + csel x1, x1, x11, cc + csel x2, x2, x12, cc + csel x3, x3, x13, cc + csel x4, x4, x14, cc + csel x5, x5, x15, cc + +// Store it back to the final output + + stp x0, x1, [res] + stp x2, x3, [res, #16] + stp x4, x5, [res, #32] + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p384/bignum_montmul_p384_neon.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montmul_p384.S similarity index 97% rename from third_party/s2n-bignum/arm/p384/bignum_montmul_p384_neon.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montmul_p384.S index 08c296bc0d2..60a960c5223 100644 --- a/third_party/s2n-bignum/arm/p384/bignum_montmul_p384_neon.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montmul_p384.S @@ -5,7 +5,7 @@ // Montgomery multiply, z := (x * y / 2^384) mod p_384 // Inputs x[6], y[6]; output z[6] // -// extern void bignum_montmul_p384_neon +// extern void bignum_montmul_p384 // (uint64_t z[static 6], uint64_t x[static 6], uint64_t y[static 6]); // // Does z := (2^{-384} * x * y) mod p_384, assuming that the inputs x and y @@ -15,7 +15,8 @@ // Standard ARM ABI: X0 = z, X1 = x, X2 = y // ---------------------------------------------------------------------------- -// bignum_montmul_p384_neon is functionally equivalent to bignum_montmul_p384. +// bignum_montmul_p384 is functionally equivalent to +// unopt/bignum_montmul_p384_base. // It is written in a way that // 1. A subset of scalar multiplications in bignum_montmul_p384 are carefully // chosen and vectorized @@ -24,9 +25,9 @@ // // The output program of step 1. is as follows: // -// stp x19, x20, [sp, #-16]! -// stp x21, x22, [sp, #-16]! -// stp x23, x24, [sp, #-16]! +// stp x19, x20, [sp, #-16]! +// stp x21, x22, [sp, #-16]! +// stp x23, x24, [sp, #-16]! 
// ldp x3, x21, [x1] // ldr q30, [x1] // ldp x8, x24, [x1, #16] @@ -433,9 +434,9 @@ // stp x10, x5, [x0] // @slothy:writes=buffer0 // stp x24, x8, [x0, #16] // @slothy:writes=buffer16 // stp x21, x2, [x0, #32] // @slothy:writes=buffer32 -// ldp x23, x24, [sp], #16 -// ldp x21, x22, [sp], #16 -// ldp x19, x20, [sp], #16 +// ldp x23, x24, [sp], #16 +// ldp x21, x22, [sp], #16 +// ldp x19, x20, [sp], #16 // ret // // The bash script used for step 2 is as follows: @@ -452,12 +453,12 @@ #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p384_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p384_neon) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p384) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p384) .text .balign 4 -S2N_BN_SYMBOL(bignum_montmul_p384_neon): +S2N_BN_SYMBOL(bignum_montmul_p384): // Save some registers diff --git a/third_party/s2n-bignum/arm/p384/bignum_montmul_p384_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montmul_p384_alt.S similarity index 88% rename from third_party/s2n-bignum/arm/p384/bignum_montmul_p384_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montmul_p384_alt.S index a6464f07cc7..c44ca21f249 100644 --- a/third_party/s2n-bignum/arm/p384/bignum_montmul_p384_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montmul_p384_alt.S @@ -34,24 +34,24 @@ #define montreds(d6,d5,d4,d3,d2,d1,d0, t3,t2,t1) \ /* Our correction multiplier is w = [d0 + (d0<<32)] mod 2^64 */ \ /* Store it in d6 to make the 2^384 * w contribution already */ \ - lsl t1, d0, #32; \ - add d6, t1, d0; \ + lsl t1, d0, #32 __LF \ + add d6, t1, d0 __LF \ /* Now let [t3;t2;t1;-] = (2^384 - p_384) * w */ \ /* We know the lowest word will cancel d0 so we don't need it */ \ - mov t1, #0xffffffff00000001; \ - umulh t1, t1, d6; \ - mov t2, #0x00000000ffffffff; \ - mul t3, t2, d6; \ - umulh t2, t2, d6; \ - adds t1, t1, t3; \ - adcs t2, t2, d6; \ - adc t3, xzr, xzr; \ + mov t1, #0xffffffff00000001 __LF \ + umulh t1, t1, d6 __LF \ + mov t2, #0x00000000ffffffff __LF \ + mul t3, t2, d6 __LF \ + umulh t2, t2, d6 __LF \ + adds t1, t1, t3 __LF \ + adcs t2, t2, d6 __LF \ + adc t3, xzr, xzr __LF \ /* Now add it, by subtracting from 2^384 * w + x */ \ - subs d1, d1, t1; \ - sbcs d2, d2, t2; \ - sbcs d3, d3, t3; \ - sbcs d4, d4, xzr; \ - sbcs d5, d5, xzr; \ + subs d1, d1, t1 __LF \ + sbcs d2, d2, t2 __LF \ + sbcs d3, d3, t3 __LF \ + sbcs d4, d4, xzr __LF \ + sbcs d5, d5, xzr __LF \ sbc d6, d6, xzr diff --git a/third_party/s2n-bignum/arm/p384/bignum_montsqr_p384_neon.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montsqr_p384.S similarity index 98% rename from third_party/s2n-bignum/arm/p384/bignum_montsqr_p384_neon.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montsqr_p384.S index 9be6380eb44..8468628b1eb 100644 --- a/third_party/s2n-bignum/arm/p384/bignum_montsqr_p384_neon.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montsqr_p384.S @@ -5,7 +5,7 @@ // Montgomery square, z := (x^2 / 2^384) mod p_384 // Input x[6]; output z[6] // -// extern void bignum_montsqr_p384_neon +// extern void bignum_montsqr_p384 // (uint64_t z[static 6], uint64_t x[static 6]); // // Does z := (x^2 / 2^384) mod p_384, assuming x^2 <= 2^384 * p_384, which is @@ -14,7 +14,8 @@ // Standard ARM ABI: X0 = z, X1 = x // ---------------------------------------------------------------------------- -// bignum_montsqr_p384_neon is functionally equivalent to bignum_montsqr_p384. 
+// bignum_montsqr_p384 is functionally equivalent to +// unopt/bignum_montsqr_p384_base. // It is written in a way that // 1. A subset of scalar multiplications in bignum_montsqr_p384 are carefully // chosen and vectorized @@ -344,12 +345,12 @@ #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p384_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p384_neon) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p384) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p384) .text .balign 4 -S2N_BN_SYMBOL(bignum_montsqr_p384_neon): +S2N_BN_SYMBOL(bignum_montsqr_p384): ldr q1, [x1] ldp x9, x2, [x1] diff --git a/third_party/s2n-bignum/arm/p384/bignum_montsqr_p384_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montsqr_p384_alt.S similarity index 86% rename from third_party/s2n-bignum/arm/p384/bignum_montsqr_p384_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montsqr_p384_alt.S index f49830d21ed..609a4bb4bf9 100644 --- a/third_party/s2n-bignum/arm/p384/bignum_montsqr_p384_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montsqr_p384_alt.S @@ -33,24 +33,24 @@ #define montreds(d6,d5,d4,d3,d2,d1,d0, t3,t2,t1) \ /* Our correction multiplier is w = [d0 + (d0<<32)] mod 2^64 */ \ /* Store it in d6 to make the 2^384 * w contribution already */ \ - lsl t1, d0, #32; \ - add d6, t1, d0; \ + lsl t1, d0, #32 __LF \ + add d6, t1, d0 __LF \ /* Now let [t3;t2;t1;-] = (2^384 - p_384) * w */ \ /* We know the lowest word will cancel d0 so we don't need it */ \ - mov t1, #0xffffffff00000001; \ - umulh t1, t1, d6; \ - mov t2, #0x00000000ffffffff; \ - mul t3, t2, d6; \ - umulh t2, t2, d6; \ - adds t1, t1, t3; \ - adcs t2, t2, d6; \ - adc t3, xzr, xzr; \ + mov t1, #0xffffffff00000001 __LF \ + umulh t1, t1, d6 __LF \ + mov t2, #0x00000000ffffffff __LF \ + mul t3, t2, d6 __LF \ + umulh t2, t2, d6 __LF \ + adds t1, t1, t3 __LF \ + adcs t2, t2, d6 __LF \ + adc t3, xzr, xzr __LF \ /* Now add it, by subtracting from 2^384 * w + x */ \ - subs d1, d1, t1; \ - sbcs d2, d2, t2; \ - sbcs d3, d3, t3; \ - sbcs d4, d4, xzr; \ - sbcs d5, d5, xzr; \ + subs d1, d1, t1 __LF \ + sbcs d2, d2, t2 __LF \ + sbcs d3, d3, t3 __LF \ + sbcs d4, d4, xzr __LF \ + sbcs d5, d5, xzr __LF \ sbc d6, d6, xzr #define z x0 diff --git a/third_party/s2n-bignum/arm/p384/bignum_mux_6.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_mux_6.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/bignum_mux_6.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_mux_6.S diff --git a/third_party/s2n-bignum/arm/p384/bignum_neg_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_neg_p384.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/bignum_neg_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_neg_p384.S diff --git a/third_party/s2n-bignum/arm/p384/bignum_nonzero_6.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_nonzero_6.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/bignum_nonzero_6.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_nonzero_6.S diff --git a/third_party/s2n-bignum/arm/p384/bignum_optneg_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_optneg_p384.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/bignum_optneg_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_optneg_p384.S diff --git 
a/third_party/s2n-bignum/arm/p384/bignum_sub_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_sub_p384.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/bignum_sub_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_sub_p384.S diff --git a/third_party/s2n-bignum/arm/p384/bignum_tomont_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_tomont_p384.S similarity index 60% rename from third_party/s2n-bignum/arm/p384/bignum_tomont_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_tomont_p384.S index c666f5e78fc..c371505bc77 100644 --- a/third_party/s2n-bignum/arm/p384/bignum_tomont_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_tomont_p384.S @@ -27,38 +27,38 @@ #define modstep_p384(d6,d5,d4,d3,d2,d1,d0, t1,t2,t3) \ /* Initial quotient approximation q = min (h + 1) (2^64 - 1) */ \ - adds d6, d6, #1; \ - csetm t3, cs; \ - add d6, d6, t3; \ - orn t3, xzr, t3; \ - sub t2, d6, #1; \ - sub t1, xzr, d6; \ + adds d6, d6, #1 __LF \ + csetm t3, cs __LF \ + add d6, d6, t3 __LF \ + orn t3, xzr, t3 __LF \ + sub t2, d6, #1 __LF \ + sub t1, xzr, d6 __LF \ /* Correction term [d6;t2;t1;d0] = q * (2^384 - p_384) */ \ - lsl d0, t1, #32; \ - extr t1, t2, t1, #32; \ - lsr t2, t2, #32; \ - adds d0, d0, d6; \ - adcs t1, t1, xzr; \ - adcs t2, t2, d6; \ - adc d6, xzr, xzr; \ + lsl d0, t1, #32 __LF \ + extr t1, t2, t1, #32 __LF \ + lsr t2, t2, #32 __LF \ + adds d0, d0, d6 __LF \ + adcs t1, t1, xzr __LF \ + adcs t2, t2, d6 __LF \ + adc d6, xzr, xzr __LF \ /* Addition to the initial value */ \ - adds d1, d1, t1; \ - adcs d2, d2, t2; \ - adcs d3, d3, d6; \ - adcs d4, d4, xzr; \ - adcs d5, d5, xzr; \ - adc t3, t3, xzr; \ + adds d1, d1, t1 __LF \ + adcs d2, d2, t2 __LF \ + adcs d3, d3, d6 __LF \ + adcs d4, d4, xzr __LF \ + adcs d5, d5, xzr __LF \ + adc t3, t3, xzr __LF \ /* Use net top of the 7-word answer in t3 for masked correction */ \ - mov t1, #0x00000000ffffffff; \ - and t1, t1, t3; \ - adds d0, d0, t1; \ - eor t1, t1, t3; \ - adcs d1, d1, t1; \ - mov t1, #0xfffffffffffffffe; \ - and t1, t1, t3; \ - adcs d2, d2, t1; \ - adcs d3, d3, t3; \ - adcs d4, d4, t3; \ + mov t1, #0x00000000ffffffff __LF \ + and t1, t1, t3 __LF \ + adds d0, d0, t1 __LF \ + eor t1, t1, t3 __LF \ + adcs d1, d1, t1 __LF \ + mov t1, #0xfffffffffffffffe __LF \ + and t1, t1, t3 __LF \ + adcs d2, d2, t1 __LF \ + adcs d3, d3, t3 __LF \ + adcs d4, d4, t3 __LF \ adc d5, d5, t3 S2N_BN_SYMBOL(bignum_tomont_p384): diff --git a/third_party/s2n-bignum/arm/p384/bignum_triple_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_triple_p384.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/bignum_triple_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_triple_p384.S diff --git a/third_party/s2n-bignum/arm/p384/p384_montjadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjadd.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/p384_montjadd.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjadd.S diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjadd_alt.S new file mode 100644 index 00000000000..e5ccc45833f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjadd_alt.S @@ -0,0 +1,993 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on NIST curve P-384 in Montgomery-Jacobian coordinates +// +// extern void p384_montjadd_alt +// (uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 18]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjadd_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 + +// Stable homes for input arguments during main code sequence + +#define input_z x24 +#define input_x x25 +#define input_y x26 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE +#define z_2 input_y, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define x1a sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define z2sq sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define y1a sp, #(NUMSIZE*6) + +#define NSPACE (NUMSIZE*7) + +// Corresponds exactly to bignum_montmul_p384_alt + +#define montmul_p384(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + mul x12, x3, x5 __LF \ + umulh x13, x3, x5 __LF \ + mul x11, x3, x6 __LF \ + umulh x14, x3, x6 __LF \ + adds x13, x13, x11 __LF \ + ldp x7, x8, [P2+16] __LF \ + mul x11, x3, x7 __LF \ + umulh x15, x3, x7 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x8 __LF \ + umulh x16, x3, x8 __LF \ + adcs x15, x15, x11 __LF \ + ldp x9, x10, [P2+32] __LF \ + mul x11, x3, x9 __LF \ + umulh x17, x3, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x19, x3, x10 __LF \ + adcs x17, x17, x11 __LF \ + adc x19, x19, xzr __LF \ + mul x11, x4, x5 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x6 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x7 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x17, x17, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x19, x19, x11 __LF \ + cset x20, cs __LF \ + umulh x11, x4, x5 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x6 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x7 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x17, x17, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x19, x19, x11 __LF \ + umulh x11, x4, x10 __LF \ + adc x20, x20, x11 __LF \ + ldp x3, x4, [P1+16] __LF \ + mul x11, x3, x5 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x3, x6 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x3, 
x7 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x3, x8 __LF \ + adcs x17, x17, x11 __LF \ + mul x11, x3, x9 __LF \ + adcs x19, x19, x11 __LF \ + mul x11, x3, x10 __LF \ + adcs x20, x20, x11 __LF \ + cset x21, cs __LF \ + umulh x11, x3, x5 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x3, x6 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x3, x7 __LF \ + adcs x17, x17, x11 __LF \ + umulh x11, x3, x8 __LF \ + adcs x19, x19, x11 __LF \ + umulh x11, x3, x9 __LF \ + adcs x20, x20, x11 __LF \ + umulh x11, x3, x10 __LF \ + adc x21, x21, x11 __LF \ + mul x11, x4, x5 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x4, x6 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x4, x7 __LF \ + adcs x17, x17, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x19, x19, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x20, x20, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x21, x21, x11 __LF \ + cset x22, cs __LF \ + umulh x11, x4, x5 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x4, x6 __LF \ + adcs x17, x17, x11 __LF \ + umulh x11, x4, x7 __LF \ + adcs x19, x19, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x20, x20, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x21, x21, x11 __LF \ + umulh x11, x4, x10 __LF \ + adc x22, x22, x11 __LF \ + ldp x3, x4, [P1+32] __LF \ + mul x11, x3, x5 __LF \ + adds x16, x16, x11 __LF \ + mul x11, x3, x6 __LF \ + adcs x17, x17, x11 __LF \ + mul x11, x3, x7 __LF \ + adcs x19, x19, x11 __LF \ + mul x11, x3, x8 __LF \ + adcs x20, x20, x11 __LF \ + mul x11, x3, x9 __LF \ + adcs x21, x21, x11 __LF \ + mul x11, x3, x10 __LF \ + adcs x22, x22, x11 __LF \ + cset x2, cs __LF \ + umulh x11, x3, x5 __LF \ + adds x17, x17, x11 __LF \ + umulh x11, x3, x6 __LF \ + adcs x19, x19, x11 __LF \ + umulh x11, x3, x7 __LF \ + adcs x20, x20, x11 __LF \ + umulh x11, x3, x8 __LF \ + adcs x21, x21, x11 __LF \ + umulh x11, x3, x9 __LF \ + adcs x22, x22, x11 __LF \ + umulh x11, x3, x10 __LF \ + adc x2, x2, x11 __LF \ + mul x11, x4, x5 __LF \ + adds x17, x17, x11 __LF \ + mul x11, x4, x6 __LF \ + adcs x19, x19, x11 __LF \ + mul x11, x4, x7 __LF \ + adcs x20, x20, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x21, x21, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x22, x22, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x2, x2, x11 __LF \ + cset x1, cs __LF \ + umulh x11, x4, x5 __LF \ + adds x19, x19, x11 __LF \ + umulh x11, x4, x6 __LF \ + adcs x20, x20, x11 __LF \ + umulh x11, x4, x7 __LF \ + adcs x21, x21, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x22, x22, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x2, x2, x11 __LF \ + umulh x11, x4, x10 __LF \ + adc x1, x1, x11 __LF \ + lsl x7, x12, #32 __LF \ + add x12, x7, x12 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x12 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x12 __LF \ + umulh x6, x6, x12 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x12 __LF \ + adc x5, xzr, xzr __LF \ + subs x13, x13, x7 __LF \ + sbcs x14, x14, x6 __LF \ + sbcs x15, x15, x5 __LF \ + sbcs x16, x16, xzr __LF \ + sbcs x17, x17, xzr __LF \ + sbc x12, x12, xzr __LF \ + lsl x7, x13, #32 __LF \ + add x13, x7, x13 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x13 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x13 __LF \ + umulh x6, x6, x13 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x13 __LF \ + adc x5, xzr, xzr __LF \ + subs x14, x14, x7 __LF \ + sbcs x15, x15, x6 __LF \ + sbcs x16, x16, x5 __LF \ + sbcs x17, x17, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbc x13, x13, xzr __LF \ + lsl x7, x14, #32 __LF \ + add x14, x7, x14 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh 
x7, x7, x14 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x14 __LF \ + umulh x6, x6, x14 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x14 __LF \ + adc x5, xzr, xzr __LF \ + subs x15, x15, x7 __LF \ + sbcs x16, x16, x6 __LF \ + sbcs x17, x17, x5 __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + sbc x14, x14, xzr __LF \ + lsl x7, x15, #32 __LF \ + add x15, x7, x15 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x15 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x15 __LF \ + umulh x6, x6, x15 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x15 __LF \ + adc x5, xzr, xzr __LF \ + subs x16, x16, x7 __LF \ + sbcs x17, x17, x6 __LF \ + sbcs x12, x12, x5 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + lsl x7, x16, #32 __LF \ + add x16, x7, x16 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x16 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x16 __LF \ + umulh x6, x6, x16 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x16 __LF \ + adc x5, xzr, xzr __LF \ + subs x17, x17, x7 __LF \ + sbcs x12, x12, x6 __LF \ + sbcs x13, x13, x5 __LF \ + sbcs x14, x14, xzr __LF \ + sbcs x15, x15, xzr __LF \ + sbc x16, x16, xzr __LF \ + lsl x7, x17, #32 __LF \ + add x17, x7, x17 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x17 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x17 __LF \ + umulh x6, x6, x17 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x17 __LF \ + adc x5, xzr, xzr __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, x6 __LF \ + sbcs x14, x14, x5 __LF \ + sbcs x15, x15, xzr __LF \ + sbcs x16, x16, xzr __LF \ + sbc x17, x17, xzr __LF \ + adds x12, x12, x19 __LF \ + adcs x13, x13, x20 __LF \ + adcs x14, x14, x21 __LF \ + adcs x15, x15, x22 __LF \ + adcs x16, x16, x2 __LF \ + adcs x17, x17, x1 __LF \ + adc x10, xzr, xzr __LF \ + mov x11, #0xffffffff00000001 __LF \ + adds x19, x12, x11 __LF \ + mov x11, #0xffffffff __LF \ + adcs x20, x13, x11 __LF \ + mov x11, #0x1 __LF \ + adcs x21, x14, x11 __LF \ + adcs x22, x15, xzr __LF \ + adcs x2, x16, xzr __LF \ + adcs x1, x17, xzr __LF \ + adcs x10, x10, xzr __LF \ + csel x12, x12, x19, eq __LF \ + csel x13, x13, x20, eq __LF \ + csel x14, x14, x21, eq __LF \ + csel x15, x15, x22, eq __LF \ + csel x16, x16, x2, eq __LF \ + csel x17, x17, x1, eq __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x15, [P0+16] __LF \ + stp x16, x17, [P0+32] + +// Corresponds exactly to bignum_montsqr_p384_alt + +#define montsqr_p384(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x8, x2, x4 __LF \ + adds x10, x10, x8 __LF \ + mul x11, x2, x5 __LF \ + mul x8, x3, x4 __LF \ + adcs x11, x11, x8 __LF \ + umulh x12, x2, x5 __LF \ + mul x8, x3, x5 __LF \ + adcs x12, x12, x8 __LF \ + ldp x6, x7, [P1+32] __LF \ + mul x13, x2, x7 __LF \ + mul x8, x3, x6 __LF \ + adcs x13, x13, x8 __LF \ + umulh x14, x2, x7 __LF \ + mul x8, x3, x7 __LF \ + adcs x14, x14, x8 __LF \ + mul x15, x5, x6 __LF \ + adcs x15, x15, xzr __LF \ + umulh x16, x5, x6 __LF \ + adc x16, x16, xzr __LF \ + umulh x8, x2, x4 __LF \ + adds x11, x11, x8 __LF \ + umulh x8, x3, x4 __LF \ + adcs x12, x12, x8 __LF \ + umulh x8, x3, x5 __LF \ + adcs x13, x13, x8 __LF \ + umulh x8, x3, x6 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x3, x7 __LF \ + adcs x15, x15, x8 __LF \ + adc x16, x16, xzr __LF \ + mul x8, x2, x6 __LF \ + adds x12, x12, x8 __LF \ + mul x8, x4, x5 __LF \ + adcs x13, x13, x8 __LF \ + mul x8, x4, x6 __LF \ + adcs x14, x14, x8 __LF \ + mul x8, x4, x7 
__LF \ + adcs x15, x15, x8 __LF \ + mul x8, x5, x7 __LF \ + adcs x16, x16, x8 __LF \ + mul x17, x6, x7 __LF \ + adcs x17, x17, xzr __LF \ + umulh x19, x6, x7 __LF \ + adc x19, x19, xzr __LF \ + umulh x8, x2, x6 __LF \ + adds x13, x13, x8 __LF \ + umulh x8, x4, x5 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x4, x6 __LF \ + adcs x15, x15, x8 __LF \ + umulh x8, x4, x7 __LF \ + adcs x16, x16, x8 __LF \ + umulh x8, x5, x7 __LF \ + adcs x17, x17, x8 __LF \ + adc x19, x19, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + adcs x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adcs x17, x17, x17 __LF \ + adcs x19, x19, x19 __LF \ + cset x20, hs __LF \ + umulh x8, x2, x2 __LF \ + mul x2, x2, x2 __LF \ + adds x9, x9, x8 __LF \ + mul x8, x3, x3 __LF \ + adcs x10, x10, x8 __LF \ + umulh x8, x3, x3 __LF \ + adcs x11, x11, x8 __LF \ + mul x8, x4, x4 __LF \ + adcs x12, x12, x8 __LF \ + umulh x8, x4, x4 __LF \ + adcs x13, x13, x8 __LF \ + mul x8, x5, x5 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x5, x5 __LF \ + adcs x15, x15, x8 __LF \ + mul x8, x6, x6 __LF \ + adcs x16, x16, x8 __LF \ + umulh x8, x6, x6 __LF \ + adcs x17, x17, x8 __LF \ + mul x8, x7, x7 __LF \ + adcs x19, x19, x8 __LF \ + umulh x8, x7, x7 __LF \ + adc x20, x20, x8 __LF \ + lsl x5, x2, #32 __LF \ + add x2, x5, x2 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x2 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x2 __LF \ + umulh x4, x4, x2 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x2 __LF \ + adc x3, xzr, xzr __LF \ + subs x9, x9, x5 __LF \ + sbcs x10, x10, x4 __LF \ + sbcs x11, x11, x3 __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + sbc x2, x2, xzr __LF \ + lsl x5, x9, #32 __LF \ + add x9, x5, x9 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x9 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x9 __LF \ + umulh x4, x4, x9 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x9 __LF \ + adc x3, xzr, xzr __LF \ + subs x10, x10, x5 __LF \ + sbcs x11, x11, x4 __LF \ + sbcs x12, x12, x3 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x2, x2, xzr __LF \ + sbc x9, x9, xzr __LF \ + lsl x5, x10, #32 __LF \ + add x10, x5, x10 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x10 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x10 __LF \ + umulh x4, x4, x10 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x10 __LF \ + adc x3, xzr, xzr __LF \ + subs x11, x11, x5 __LF \ + sbcs x12, x12, x4 __LF \ + sbcs x13, x13, x3 __LF \ + sbcs x2, x2, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + lsl x5, x11, #32 __LF \ + add x11, x5, x11 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x11 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x11 __LF \ + umulh x4, x4, x11 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x11 __LF \ + adc x3, xzr, xzr __LF \ + subs x12, x12, x5 __LF \ + sbcs x13, x13, x4 __LF \ + sbcs x2, x2, x3 __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbc x11, x11, xzr __LF \ + lsl x5, x12, #32 __LF \ + add x12, x5, x12 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x12 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x12 __LF \ + umulh x4, x4, x12 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x12 __LF \ + adc x3, xzr, xzr __LF \ + subs x13, x13, x5 __LF \ + sbcs x2, x2, x4 __LF \ + sbcs x9, x9, x3 __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbc x12, x12, xzr __LF \ + lsl x5, x13, #32 __LF \ + add x13, x5, x13 __LF \ + 
mov x5, #-4294967295 __LF \ + umulh x5, x5, x13 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x13 __LF \ + umulh x4, x4, x13 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x13 __LF \ + adc x3, xzr, xzr __LF \ + subs x2, x2, x5 __LF \ + sbcs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbc x13, x13, xzr __LF \ + adds x2, x2, x14 __LF \ + adcs x9, x9, x15 __LF \ + adcs x10, x10, x16 __LF \ + adcs x11, x11, x17 __LF \ + adcs x12, x12, x19 __LF \ + adcs x13, x13, x20 __LF \ + adc x6, xzr, xzr __LF \ + mov x8, #-4294967295 __LF \ + adds x14, x2, x8 __LF \ + mov x8, #4294967295 __LF \ + adcs x15, x9, x8 __LF \ + mov x8, #1 __LF \ + adcs x16, x10, x8 __LF \ + adcs x17, x11, xzr __LF \ + adcs x19, x12, xzr __LF \ + adcs x20, x13, xzr __LF \ + adcs x6, x6, xzr __LF \ + csel x2, x2, x14, eq __LF \ + csel x9, x9, x15, eq __LF \ + csel x10, x10, x16, eq __LF \ + csel x11, x11, x17, eq __LF \ + csel x12, x12, x19, eq __LF \ + csel x13, x13, x20, eq __LF \ + stp x2, x9, [P0] __LF \ + stp x10, x11, [P0+16] __LF \ + stp x12, x13, [P0+32] + +// Almost-Montgomery variant which we use when an input to other muls +// with the other argument fully reduced (which is always safe). In +// fact, with the Karatsuba-based Montgomery mul here, we don't even +// *need* the restriction that the other argument is reduced. + +#define amontsqr_p384(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x8, x2, x4 __LF \ + adds x10, x10, x8 __LF \ + mul x11, x2, x5 __LF \ + mul x8, x3, x4 __LF \ + adcs x11, x11, x8 __LF \ + umulh x12, x2, x5 __LF \ + mul x8, x3, x5 __LF \ + adcs x12, x12, x8 __LF \ + ldp x6, x7, [P1+32] __LF \ + mul x13, x2, x7 __LF \ + mul x8, x3, x6 __LF \ + adcs x13, x13, x8 __LF \ + umulh x14, x2, x7 __LF \ + mul x8, x3, x7 __LF \ + adcs x14, x14, x8 __LF \ + mul x15, x5, x6 __LF \ + adcs x15, x15, xzr __LF \ + umulh x16, x5, x6 __LF \ + adc x16, x16, xzr __LF \ + umulh x8, x2, x4 __LF \ + adds x11, x11, x8 __LF \ + umulh x8, x3, x4 __LF \ + adcs x12, x12, x8 __LF \ + umulh x8, x3, x5 __LF \ + adcs x13, x13, x8 __LF \ + umulh x8, x3, x6 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x3, x7 __LF \ + adcs x15, x15, x8 __LF \ + adc x16, x16, xzr __LF \ + mul x8, x2, x6 __LF \ + adds x12, x12, x8 __LF \ + mul x8, x4, x5 __LF \ + adcs x13, x13, x8 __LF \ + mul x8, x4, x6 __LF \ + adcs x14, x14, x8 __LF \ + mul x8, x4, x7 __LF \ + adcs x15, x15, x8 __LF \ + mul x8, x5, x7 __LF \ + adcs x16, x16, x8 __LF \ + mul x17, x6, x7 __LF \ + adcs x17, x17, xzr __LF \ + umulh x19, x6, x7 __LF \ + adc x19, x19, xzr __LF \ + umulh x8, x2, x6 __LF \ + adds x13, x13, x8 __LF \ + umulh x8, x4, x5 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x4, x6 __LF \ + adcs x15, x15, x8 __LF \ + umulh x8, x4, x7 __LF \ + adcs x16, x16, x8 __LF \ + umulh x8, x5, x7 __LF \ + adcs x17, x17, x8 __LF \ + adc x19, x19, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + adcs x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adcs x17, x17, x17 __LF \ + adcs x19, x19, x19 __LF \ + cset x20, hs __LF \ + umulh x8, x2, x2 __LF \ + mul x2, x2, x2 __LF \ + adds x9, x9, x8 __LF \ + mul x8, x3, x3 __LF \ + adcs x10, x10, x8 __LF \ + umulh x8, x3, x3 __LF \ + adcs x11, x11, x8 __LF \ + mul x8, x4, x4 __LF \ + adcs x12, x12, x8 __LF \ + umulh x8, x4, x4 __LF \ + adcs x13, x13, x8 __LF \ + mul x8, x5, x5 
__LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x5, x5 __LF \ + adcs x15, x15, x8 __LF \ + mul x8, x6, x6 __LF \ + adcs x16, x16, x8 __LF \ + umulh x8, x6, x6 __LF \ + adcs x17, x17, x8 __LF \ + mul x8, x7, x7 __LF \ + adcs x19, x19, x8 __LF \ + umulh x8, x7, x7 __LF \ + adc x20, x20, x8 __LF \ + lsl x5, x2, #32 __LF \ + add x2, x5, x2 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x2 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x2 __LF \ + umulh x4, x4, x2 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x2 __LF \ + adc x3, xzr, xzr __LF \ + subs x9, x9, x5 __LF \ + sbcs x10, x10, x4 __LF \ + sbcs x11, x11, x3 __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + sbc x2, x2, xzr __LF \ + lsl x5, x9, #32 __LF \ + add x9, x5, x9 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x9 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x9 __LF \ + umulh x4, x4, x9 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x9 __LF \ + adc x3, xzr, xzr __LF \ + subs x10, x10, x5 __LF \ + sbcs x11, x11, x4 __LF \ + sbcs x12, x12, x3 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x2, x2, xzr __LF \ + sbc x9, x9, xzr __LF \ + lsl x5, x10, #32 __LF \ + add x10, x5, x10 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x10 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x10 __LF \ + umulh x4, x4, x10 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x10 __LF \ + adc x3, xzr, xzr __LF \ + subs x11, x11, x5 __LF \ + sbcs x12, x12, x4 __LF \ + sbcs x13, x13, x3 __LF \ + sbcs x2, x2, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + lsl x5, x11, #32 __LF \ + add x11, x5, x11 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x11 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x11 __LF \ + umulh x4, x4, x11 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x11 __LF \ + adc x3, xzr, xzr __LF \ + subs x12, x12, x5 __LF \ + sbcs x13, x13, x4 __LF \ + sbcs x2, x2, x3 __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbc x11, x11, xzr __LF \ + lsl x5, x12, #32 __LF \ + add x12, x5, x12 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x12 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x12 __LF \ + umulh x4, x4, x12 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x12 __LF \ + adc x3, xzr, xzr __LF \ + subs x13, x13, x5 __LF \ + sbcs x2, x2, x4 __LF \ + sbcs x9, x9, x3 __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbc x12, x12, xzr __LF \ + lsl x5, x13, #32 __LF \ + add x13, x5, x13 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x13 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x13 __LF \ + umulh x4, x4, x13 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x13 __LF \ + adc x3, xzr, xzr __LF \ + subs x2, x2, x5 __LF \ + sbcs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbc x13, x13, xzr __LF \ + adds x2, x2, x14 __LF \ + adcs x9, x9, x15 __LF \ + adcs x10, x10, x16 __LF \ + adcs x11, x11, x17 __LF \ + adcs x12, x12, x19 __LF \ + adcs x13, x13, x20 __LF \ + mov x14, #-4294967295 __LF \ + mov x15, #4294967295 __LF \ + csel x14, x14, xzr, cs __LF \ + csel x15, x15, xzr, cs __LF \ + cset x16, cs __LF \ + adds x2, x2, x14 __LF \ + adcs x9, x9, x15 __LF \ + adcs x10, x10, x16 __LF \ + adcs x11, x11, xzr __LF \ + adcs x12, x12, xzr __LF \ + adc x13, x13, xzr __LF \ + stp x2, x9, [P0] __LF \ + stp x10, x11, [P0+16] __LF \ + stp x12, x13, [P0+32] + +// Corresponds exactly to bignum_sub_p384 + +#define sub_p384(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + 
sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + ldp x9, x10, [P1+32] __LF \ + ldp x4, x3, [P2+32] __LF \ + sbcs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + csetm x3, lo __LF \ + mov x4, #4294967295 __LF \ + and x4, x4, x3 __LF \ + adds x5, x5, x4 __LF \ + eor x4, x4, x3 __LF \ + adcs x6, x6, x4 __LF \ + mov x4, #-2 __LF \ + and x4, x4, x3 __LF \ + adcs x7, x7, x4 __LF \ + adcs x8, x8, x3 __LF \ + adcs x9, x9, x3 __LF \ + adc x10, x10, x3 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] + +S2N_BN_SYMBOL(p384_montjadd_alt): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + amontsqr_p384(z1sq,z_1) + amontsqr_p384(z2sq,z_2) + + montmul_p384(y1a,z_2,y_1) + montmul_p384(y2a,z_1,y_2) + + montmul_p384(x2a,z1sq,x_2) + montmul_p384(x1a,z2sq,x_1) + montmul_p384(y2a,z1sq,y2a) + montmul_p384(y1a,z2sq,y1a) + + sub_p384(xd,x2a,x1a) + sub_p384(yd,y2a,y1a) + + amontsqr_p384(zz,xd) + montsqr_p384(ww,yd) + + montmul_p384(zzx1,zz,x1a) + montmul_p384(zzx2,zz,x2a) + + sub_p384(resx,ww,zzx1) + sub_p384(t1,zzx2,zzx1) + + montmul_p384(xd,xd,z_1) + + sub_p384(resx,resx,zzx2) + + sub_p384(t2,zzx1,resx) + + montmul_p384(t1,t1,y1a) + montmul_p384(resz,xd,z_2) + montmul_p384(t2,yd,t2) + + sub_p384(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) +// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + ldp x4, x5, [z_1+32] + + orr x20, x0, x1 + orr x21, x2, x3 + orr x22, x4, x5 + orr x20, x20, x21 + orr x20, x20, x22 + cmp x20, xzr + cset x20, ne + + ldp x6, x7, [z_2] + ldp x8, x9, [z_2+16] + ldp x10, x11, [z_2+32] + + orr x21, x6, x7 + orr x22, x8, x9 + orr x23, x10, x11 + orr x21, x21, x22 + orr x21, x21, x23 + cmp x21, xzr + cset x21, ne + + cmp x21, x20 + +// Multiplex the outputs accordingly, re-using the z's in registers + + ldp x12, x13, [resz] + csel x12, x0, x12, lo + csel x13, x1, x13, lo + csel x12, x6, x12, hi + csel x13, x7, x13, hi + ldp x14, x15, [resz+16] + csel x14, x2, x14, lo + csel x15, x3, x15, lo + csel x14, x8, x14, hi + csel x15, x9, x15, hi + ldp x16, x17, [resz+32] + csel x16, x4, x16, lo + csel x17, x5, x17, lo + csel x16, x10, x16, hi + csel x17, x11, x17, hi + + ldp x20, x21, [x_1] + ldp x0, x1, [resx] + csel x0, x20, x0, lo + csel x1, x21, x1, lo + ldp x20, x21, [x_2] + csel x0, x20, x0, hi + csel x1, x21, x1, hi + + ldp x20, x21, [x_1+16] + ldp x2, x3, [resx+16] + csel x2, x20, x2, lo + csel x3, x21, x3, lo + ldp x20, x21, [x_2+16] + csel x2, x20, x2, hi + csel x3, x21, x3, hi + + ldp x20, x21, [x_1+32] + ldp x4, x5, [resx+32] + csel x4, x20, x4, lo + csel x5, x21, x5, lo + ldp x20, x21, [x_2+32] + csel x4, x20, x4, hi + csel x5, x21, x5, hi + + ldp x20, x21, [y_1] + ldp x6, x7, [resy] + csel x6, x20, x6, lo + csel x7, x21, x7, lo + ldp x20, x21, [y_2] + csel x6, x20, x6, hi + csel x7, x21, x7, hi + + ldp x20, x21, [y_1+16] + ldp x8, x9, [resy+16] + csel x8, x20, x8, lo + csel x9, x21, x9, lo + ldp x20, x21, [y_2+16] + csel 
x8, x20, x8, hi + csel x9, x21, x9, hi + + ldp x20, x21, [y_1+32] + ldp x10, x11, [resy+32] + csel x10, x20, x10, lo + csel x11, x21, x11, lo + ldp x20, x21, [y_2+32] + csel x10, x20, x10, hi + csel x11, x21, x11, hi + +// Finally store back the multiplexed values + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [x_3+32] + stp x6, x7, [y_3] + stp x8, x9, [y_3+16] + stp x10, x11, [y_3+32] + stp x12, x13, [z_3] + stp x14, x15, [z_3+16] + stp x16, x17, [z_3+32] + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p384/p384_montjdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjdouble.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/p384_montjdouble.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjdouble.S diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjdouble_alt.S new file mode 100644 index 00000000000..c8a96fbba44 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjdouble_alt.S @@ -0,0 +1,951 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on NIST curve P-384 in Montgomery-Jacobian coordinates +// +// extern void p384_montjdouble_alt +// (uint64_t p3[static 18],uint64_t p1[static 18]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). 
+// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjdouble_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 + +// Stable homes for input arguments during main code sequence + +#define input_z x23 +#define input_x x24 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z2 sp, #(NUMSIZE*0) +#define y2 sp, #(NUMSIZE*1) +#define x2p sp, #(NUMSIZE*2) +#define xy2 sp, #(NUMSIZE*3) + +#define y4 sp, #(NUMSIZE*4) +#define t2 sp, #(NUMSIZE*4) + +#define dx2 sp, #(NUMSIZE*5) +#define t1 sp, #(NUMSIZE*5) + +#define d sp, #(NUMSIZE*6) +#define x4p sp, #(NUMSIZE*6) + +#define NSPACE (NUMSIZE*7) + +// Corresponds exactly to bignum_montmul_p384_alt + +#define montmul_p384(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + mul x12, x3, x5 __LF \ + umulh x13, x3, x5 __LF \ + mul x11, x3, x6 __LF \ + umulh x14, x3, x6 __LF \ + adds x13, x13, x11 __LF \ + ldp x7, x8, [P2+16] __LF \ + mul x11, x3, x7 __LF \ + umulh x15, x3, x7 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x8 __LF \ + umulh x16, x3, x8 __LF \ + adcs x15, x15, x11 __LF \ + ldp x9, x10, [P2+32] __LF \ + mul x11, x3, x9 __LF \ + umulh x17, x3, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x19, x3, x10 __LF \ + adcs x17, x17, x11 __LF \ + adc x19, x19, xzr __LF \ + mul x11, x4, x5 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x6 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x7 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x17, x17, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x19, x19, x11 __LF \ + cset x20, cs __LF \ + umulh x11, x4, x5 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x6 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x7 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x17, x17, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x19, x19, x11 __LF \ + umulh x11, x4, x10 __LF \ + adc x20, x20, x11 __LF \ + ldp x3, x4, [P1+16] __LF \ + mul x11, x3, x5 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x3, x6 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x3, x7 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x3, x8 __LF \ + adcs x17, x17, x11 __LF \ + mul x11, x3, x9 __LF \ + adcs x19, x19, x11 __LF \ + mul x11, x3, x10 __LF \ + adcs x20, x20, x11 __LF \ + cset x21, cs __LF \ + umulh x11, x3, x5 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x3, x6 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x3, x7 __LF \ + adcs x17, x17, x11 __LF \ + umulh x11, x3, x8 __LF \ + adcs x19, x19, x11 __LF \ + umulh x11, x3, x9 __LF \ + adcs x20, x20, x11 __LF \ + umulh x11, x3, x10 __LF \ + adc x21, x21, x11 __LF \ + mul x11, x4, x5 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x4, x6 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x4, x7 __LF \ + adcs x17, x17, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x19, x19, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x20, x20, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x21, x21, x11 __LF \ + cset x22, cs __LF \ + umulh 
x11, x4, x5 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x4, x6 __LF \ + adcs x17, x17, x11 __LF \ + umulh x11, x4, x7 __LF \ + adcs x19, x19, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x20, x20, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x21, x21, x11 __LF \ + umulh x11, x4, x10 __LF \ + adc x22, x22, x11 __LF \ + ldp x3, x4, [P1+32] __LF \ + mul x11, x3, x5 __LF \ + adds x16, x16, x11 __LF \ + mul x11, x3, x6 __LF \ + adcs x17, x17, x11 __LF \ + mul x11, x3, x7 __LF \ + adcs x19, x19, x11 __LF \ + mul x11, x3, x8 __LF \ + adcs x20, x20, x11 __LF \ + mul x11, x3, x9 __LF \ + adcs x21, x21, x11 __LF \ + mul x11, x3, x10 __LF \ + adcs x22, x22, x11 __LF \ + cset x2, cs __LF \ + umulh x11, x3, x5 __LF \ + adds x17, x17, x11 __LF \ + umulh x11, x3, x6 __LF \ + adcs x19, x19, x11 __LF \ + umulh x11, x3, x7 __LF \ + adcs x20, x20, x11 __LF \ + umulh x11, x3, x8 __LF \ + adcs x21, x21, x11 __LF \ + umulh x11, x3, x9 __LF \ + adcs x22, x22, x11 __LF \ + umulh x11, x3, x10 __LF \ + adc x2, x2, x11 __LF \ + mul x11, x4, x5 __LF \ + adds x17, x17, x11 __LF \ + mul x11, x4, x6 __LF \ + adcs x19, x19, x11 __LF \ + mul x11, x4, x7 __LF \ + adcs x20, x20, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x21, x21, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x22, x22, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x2, x2, x11 __LF \ + cset x1, cs __LF \ + umulh x11, x4, x5 __LF \ + adds x19, x19, x11 __LF \ + umulh x11, x4, x6 __LF \ + adcs x20, x20, x11 __LF \ + umulh x11, x4, x7 __LF \ + adcs x21, x21, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x22, x22, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x2, x2, x11 __LF \ + umulh x11, x4, x10 __LF \ + adc x1, x1, x11 __LF \ + lsl x7, x12, #32 __LF \ + add x12, x7, x12 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x12 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x12 __LF \ + umulh x6, x6, x12 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x12 __LF \ + adc x5, xzr, xzr __LF \ + subs x13, x13, x7 __LF \ + sbcs x14, x14, x6 __LF \ + sbcs x15, x15, x5 __LF \ + sbcs x16, x16, xzr __LF \ + sbcs x17, x17, xzr __LF \ + sbc x12, x12, xzr __LF \ + lsl x7, x13, #32 __LF \ + add x13, x7, x13 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x13 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x13 __LF \ + umulh x6, x6, x13 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x13 __LF \ + adc x5, xzr, xzr __LF \ + subs x14, x14, x7 __LF \ + sbcs x15, x15, x6 __LF \ + sbcs x16, x16, x5 __LF \ + sbcs x17, x17, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbc x13, x13, xzr __LF \ + lsl x7, x14, #32 __LF \ + add x14, x7, x14 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x14 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x14 __LF \ + umulh x6, x6, x14 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x14 __LF \ + adc x5, xzr, xzr __LF \ + subs x15, x15, x7 __LF \ + sbcs x16, x16, x6 __LF \ + sbcs x17, x17, x5 __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + sbc x14, x14, xzr __LF \ + lsl x7, x15, #32 __LF \ + add x15, x7, x15 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x15 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x15 __LF \ + umulh x6, x6, x15 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x15 __LF \ + adc x5, xzr, xzr __LF \ + subs x16, x16, x7 __LF \ + sbcs x17, x17, x6 __LF \ + sbcs x12, x12, x5 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + lsl x7, x16, #32 __LF \ + add x16, x7, x16 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x16 __LF \ + 
mov x6, #0xffffffff __LF \ + mul x5, x6, x16 __LF \ + umulh x6, x6, x16 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x16 __LF \ + adc x5, xzr, xzr __LF \ + subs x17, x17, x7 __LF \ + sbcs x12, x12, x6 __LF \ + sbcs x13, x13, x5 __LF \ + sbcs x14, x14, xzr __LF \ + sbcs x15, x15, xzr __LF \ + sbc x16, x16, xzr __LF \ + lsl x7, x17, #32 __LF \ + add x17, x7, x17 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x17 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x17 __LF \ + umulh x6, x6, x17 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x17 __LF \ + adc x5, xzr, xzr __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, x6 __LF \ + sbcs x14, x14, x5 __LF \ + sbcs x15, x15, xzr __LF \ + sbcs x16, x16, xzr __LF \ + sbc x17, x17, xzr __LF \ + adds x12, x12, x19 __LF \ + adcs x13, x13, x20 __LF \ + adcs x14, x14, x21 __LF \ + adcs x15, x15, x22 __LF \ + adcs x16, x16, x2 __LF \ + adcs x17, x17, x1 __LF \ + adc x10, xzr, xzr __LF \ + mov x11, #0xffffffff00000001 __LF \ + adds x19, x12, x11 __LF \ + mov x11, #0xffffffff __LF \ + adcs x20, x13, x11 __LF \ + mov x11, #0x1 __LF \ + adcs x21, x14, x11 __LF \ + adcs x22, x15, xzr __LF \ + adcs x2, x16, xzr __LF \ + adcs x1, x17, xzr __LF \ + adcs x10, x10, xzr __LF \ + csel x12, x12, x19, eq __LF \ + csel x13, x13, x20, eq __LF \ + csel x14, x14, x21, eq __LF \ + csel x15, x15, x22, eq __LF \ + csel x16, x16, x2, eq __LF \ + csel x17, x17, x1, eq __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x15, [P0+16] __LF \ + stp x16, x17, [P0+32] + +// Corresponds exactly to bignum_montsqr_p384_alt + +#define montsqr_p384(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x8, x2, x4 __LF \ + adds x10, x10, x8 __LF \ + mul x11, x2, x5 __LF \ + mul x8, x3, x4 __LF \ + adcs x11, x11, x8 __LF \ + umulh x12, x2, x5 __LF \ + mul x8, x3, x5 __LF \ + adcs x12, x12, x8 __LF \ + ldp x6, x7, [P1+32] __LF \ + mul x13, x2, x7 __LF \ + mul x8, x3, x6 __LF \ + adcs x13, x13, x8 __LF \ + umulh x14, x2, x7 __LF \ + mul x8, x3, x7 __LF \ + adcs x14, x14, x8 __LF \ + mul x15, x5, x6 __LF \ + adcs x15, x15, xzr __LF \ + umulh x16, x5, x6 __LF \ + adc x16, x16, xzr __LF \ + umulh x8, x2, x4 __LF \ + adds x11, x11, x8 __LF \ + umulh x8, x3, x4 __LF \ + adcs x12, x12, x8 __LF \ + umulh x8, x3, x5 __LF \ + adcs x13, x13, x8 __LF \ + umulh x8, x3, x6 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x3, x7 __LF \ + adcs x15, x15, x8 __LF \ + adc x16, x16, xzr __LF \ + mul x8, x2, x6 __LF \ + adds x12, x12, x8 __LF \ + mul x8, x4, x5 __LF \ + adcs x13, x13, x8 __LF \ + mul x8, x4, x6 __LF \ + adcs x14, x14, x8 __LF \ + mul x8, x4, x7 __LF \ + adcs x15, x15, x8 __LF \ + mul x8, x5, x7 __LF \ + adcs x16, x16, x8 __LF \ + mul x17, x6, x7 __LF \ + adcs x17, x17, xzr __LF \ + umulh x19, x6, x7 __LF \ + adc x19, x19, xzr __LF \ + umulh x8, x2, x6 __LF \ + adds x13, x13, x8 __LF \ + umulh x8, x4, x5 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x4, x6 __LF \ + adcs x15, x15, x8 __LF \ + umulh x8, x4, x7 __LF \ + adcs x16, x16, x8 __LF \ + umulh x8, x5, x7 __LF \ + adcs x17, x17, x8 __LF \ + adc x19, x19, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + adcs x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adcs x17, x17, x17 __LF \ + adcs x19, x19, x19 __LF \ + cset x20, hs __LF \ + umulh x8, x2, x2 __LF \ + mul x2, x2, x2 __LF \ + adds x9, x9, x8 __LF \ + mul x8, x3, x3 __LF \ + adcs x10, 
x10, x8 __LF \ + umulh x8, x3, x3 __LF \ + adcs x11, x11, x8 __LF \ + mul x8, x4, x4 __LF \ + adcs x12, x12, x8 __LF \ + umulh x8, x4, x4 __LF \ + adcs x13, x13, x8 __LF \ + mul x8, x5, x5 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x5, x5 __LF \ + adcs x15, x15, x8 __LF \ + mul x8, x6, x6 __LF \ + adcs x16, x16, x8 __LF \ + umulh x8, x6, x6 __LF \ + adcs x17, x17, x8 __LF \ + mul x8, x7, x7 __LF \ + adcs x19, x19, x8 __LF \ + umulh x8, x7, x7 __LF \ + adc x20, x20, x8 __LF \ + lsl x5, x2, #32 __LF \ + add x2, x5, x2 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x2 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x2 __LF \ + umulh x4, x4, x2 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x2 __LF \ + adc x3, xzr, xzr __LF \ + subs x9, x9, x5 __LF \ + sbcs x10, x10, x4 __LF \ + sbcs x11, x11, x3 __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + sbc x2, x2, xzr __LF \ + lsl x5, x9, #32 __LF \ + add x9, x5, x9 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x9 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x9 __LF \ + umulh x4, x4, x9 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x9 __LF \ + adc x3, xzr, xzr __LF \ + subs x10, x10, x5 __LF \ + sbcs x11, x11, x4 __LF \ + sbcs x12, x12, x3 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x2, x2, xzr __LF \ + sbc x9, x9, xzr __LF \ + lsl x5, x10, #32 __LF \ + add x10, x5, x10 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x10 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x10 __LF \ + umulh x4, x4, x10 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x10 __LF \ + adc x3, xzr, xzr __LF \ + subs x11, x11, x5 __LF \ + sbcs x12, x12, x4 __LF \ + sbcs x13, x13, x3 __LF \ + sbcs x2, x2, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + lsl x5, x11, #32 __LF \ + add x11, x5, x11 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x11 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x11 __LF \ + umulh x4, x4, x11 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x11 __LF \ + adc x3, xzr, xzr __LF \ + subs x12, x12, x5 __LF \ + sbcs x13, x13, x4 __LF \ + sbcs x2, x2, x3 __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbc x11, x11, xzr __LF \ + lsl x5, x12, #32 __LF \ + add x12, x5, x12 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x12 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x12 __LF \ + umulh x4, x4, x12 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x12 __LF \ + adc x3, xzr, xzr __LF \ + subs x13, x13, x5 __LF \ + sbcs x2, x2, x4 __LF \ + sbcs x9, x9, x3 __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbc x12, x12, xzr __LF \ + lsl x5, x13, #32 __LF \ + add x13, x5, x13 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x13 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x13 __LF \ + umulh x4, x4, x13 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x13 __LF \ + adc x3, xzr, xzr __LF \ + subs x2, x2, x5 __LF \ + sbcs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbc x13, x13, xzr __LF \ + adds x2, x2, x14 __LF \ + adcs x9, x9, x15 __LF \ + adcs x10, x10, x16 __LF \ + adcs x11, x11, x17 __LF \ + adcs x12, x12, x19 __LF \ + adcs x13, x13, x20 __LF \ + adc x6, xzr, xzr __LF \ + mov x8, #-4294967295 __LF \ + adds x14, x2, x8 __LF \ + mov x8, #4294967295 __LF \ + adcs x15, x9, x8 __LF \ + mov x8, #1 __LF \ + adcs x16, x10, x8 __LF \ + adcs x17, x11, xzr __LF \ + adcs x19, x12, xzr __LF \ + adcs x20, x13, xzr __LF \ + adcs x6, x6, xzr __LF \ + csel x2, x2, x14, eq __LF \ + csel x9, x9, x15, eq __LF \ + 
csel x10, x10, x16, eq __LF \ + csel x11, x11, x17, eq __LF \ + csel x12, x12, x19, eq __LF \ + csel x13, x13, x20, eq __LF \ + stp x2, x9, [P0] __LF \ + stp x10, x11, [P0+16] __LF \ + stp x12, x13, [P0+32] + +// Corresponds exactly to bignum_sub_p384 + +#define sub_p384(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + ldp x9, x10, [P1+32] __LF \ + ldp x4, x3, [P2+32] __LF \ + sbcs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + csetm x3, lo __LF \ + mov x4, #4294967295 __LF \ + and x4, x4, x3 __LF \ + adds x5, x5, x4 __LF \ + eor x4, x4, x3 __LF \ + adcs x6, x6, x4 __LF \ + mov x4, #-2 __LF \ + and x4, x4, x3 __LF \ + adcs x7, x7, x4 __LF \ + adcs x8, x8, x3 __LF \ + adcs x9, x9, x3 __LF \ + adc x10, x10, x3 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] + +// Corresponds exactly to bignum_add_p384 + +#define add_p384(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + adds x5, x5, x4 __LF \ + adcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + adcs x7, x7, x4 __LF \ + adcs x8, x8, x3 __LF \ + ldp x9, x10, [P1+32] __LF \ + ldp x4, x3, [P2+32] __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x3 __LF \ + adc x3, xzr, xzr __LF \ + mov x4, #0xffffffff __LF \ + cmp x5, x4 __LF \ + mov x4, #0xffffffff00000000 __LF \ + sbcs xzr, x6, x4 __LF \ + mov x4, #0xfffffffffffffffe __LF \ + sbcs xzr, x7, x4 __LF \ + adcs xzr, x8, xzr __LF \ + adcs xzr, x9, xzr __LF \ + adcs xzr, x10, xzr __LF \ + adcs x3, x3, xzr __LF \ + csetm x3, ne __LF \ + mov x4, #0xffffffff __LF \ + and x4, x4, x3 __LF \ + subs x5, x5, x4 __LF \ + eor x4, x4, x3 __LF \ + sbcs x6, x6, x4 __LF \ + mov x4, #0xfffffffffffffffe __LF \ + and x4, x4, x3 __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + sbcs x9, x9, x3 __LF \ + sbc x10, x10, x3 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] + +// P0 = 4 * P1 - P2 + +#define cmsub41_p384(P0,P1,P2) \ + ldp x1, x2, [P1] __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P1+32] __LF \ + lsl x0, x1, #2 __LF \ + ldp x7, x8, [P2] __LF \ + subs x0, x0, x7 __LF \ + extr x1, x2, x1, #62 __LF \ + sbcs x1, x1, x8 __LF \ + ldp x7, x8, [P2+16] __LF \ + extr x2, x3, x2, #62 __LF \ + sbcs x2, x2, x7 __LF \ + extr x3, x4, x3, #62 __LF \ + sbcs x3, x3, x8 __LF \ + extr x4, x5, x4, #62 __LF \ + ldp x7, x8, [P2+32] __LF \ + sbcs x4, x4, x7 __LF \ + extr x5, x6, x5, #62 __LF \ + sbcs x5, x5, x8 __LF \ + lsr x6, x6, #62 __LF \ + adc x6, x6, xzr __LF \ + lsl x7, x6, #32 __LF \ + subs x8, x6, x7 __LF \ + sbc x7, x7, xzr __LF \ + adds x0, x0, x8 __LF \ + adcs x1, x1, x7 __LF \ + adcs x2, x2, x6 __LF \ + adcs x3, x3, xzr __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + csetm x8, cc __LF \ + mov x9, #0xffffffff __LF \ + and x9, x9, x8 __LF \ + adds x0, x0, x9 __LF \ + eor x9, x9, x8 __LF \ + adcs x1, x1, x9 __LF \ + mov x9, #0xfffffffffffffffe __LF \ + and x9, x9, x8 __LF \ + adcs x2, x2, x9 __LF \ + adcs x3, x3, x8 __LF \ + adcs x4, x4, x8 __LF \ + adc x5, x5, x8 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] __LF \ + stp x4, x5, [P0+32] + +// P0 = C * P1 - D * P2 + +#define cmsub_p384(P0,C,P1,D,P2) \ + ldp x0, x1, [P2] __LF \ + mov x6, #0x00000000ffffffff __LF \ + subs x6, x6, x0 __LF \ + mov x7, #0xffffffff00000000 __LF \ + sbcs x7, x7, x1 __LF \ + ldp x0, x1, [P2+16] __LF \ + mov x8, 
#0xfffffffffffffffe __LF \ + sbcs x8, x8, x0 __LF \ + mov x13, #0xffffffffffffffff __LF \ + sbcs x9, x13, x1 __LF \ + ldp x0, x1, [P2+32] __LF \ + sbcs x10, x13, x0 __LF \ + sbc x11, x13, x1 __LF \ + mov x12, D __LF \ + mul x0, x12, x6 __LF \ + mul x1, x12, x7 __LF \ + mul x2, x12, x8 __LF \ + mul x3, x12, x9 __LF \ + mul x4, x12, x10 __LF \ + mul x5, x12, x11 __LF \ + umulh x6, x12, x6 __LF \ + umulh x7, x12, x7 __LF \ + umulh x8, x12, x8 __LF \ + umulh x9, x12, x9 __LF \ + umulh x10, x12, x10 __LF \ + umulh x12, x12, x11 __LF \ + adds x1, x1, x6 __LF \ + adcs x2, x2, x7 __LF \ + adcs x3, x3, x8 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + mov x6, #1 __LF \ + adc x6, x12, x6 __LF \ + ldp x8, x9, [P1] __LF \ + ldp x10, x11, [P1+16] __LF \ + ldp x12, x13, [P1+32] __LF \ + mov x14, C __LF \ + mul x15, x14, x8 __LF \ + umulh x8, x14, x8 __LF \ + adds x0, x0, x15 __LF \ + mul x15, x14, x9 __LF \ + umulh x9, x14, x9 __LF \ + adcs x1, x1, x15 __LF \ + mul x15, x14, x10 __LF \ + umulh x10, x14, x10 __LF \ + adcs x2, x2, x15 __LF \ + mul x15, x14, x11 __LF \ + umulh x11, x14, x11 __LF \ + adcs x3, x3, x15 __LF \ + mul x15, x14, x12 __LF \ + umulh x12, x14, x12 __LF \ + adcs x4, x4, x15 __LF \ + mul x15, x14, x13 __LF \ + umulh x13, x14, x13 __LF \ + adcs x5, x5, x15 __LF \ + adc x6, x6, xzr __LF \ + adds x1, x1, x8 __LF \ + adcs x2, x2, x9 __LF \ + adcs x3, x3, x10 __LF \ + adcs x4, x4, x11 __LF \ + adcs x5, x5, x12 __LF \ + adcs x6, x6, x13 __LF \ + lsl x7, x6, #32 __LF \ + subs x8, x6, x7 __LF \ + sbc x7, x7, xzr __LF \ + adds x0, x0, x8 __LF \ + adcs x1, x1, x7 __LF \ + adcs x2, x2, x6 __LF \ + adcs x3, x3, xzr __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + csetm x6, cc __LF \ + mov x7, #0xffffffff __LF \ + and x7, x7, x6 __LF \ + adds x0, x0, x7 __LF \ + eor x7, x7, x6 __LF \ + adcs x1, x1, x7 __LF \ + mov x7, #0xfffffffffffffffe __LF \ + and x7, x7, x6 __LF \ + adcs x2, x2, x7 __LF \ + adcs x3, x3, x6 __LF \ + adcs x4, x4, x6 __LF \ + adc x5, x5, x6 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] __LF \ + stp x4, x5, [P0+32] + +// A weak version of add that only guarantees sum in 6 digits + +#define weakadd_p384(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + adds x5, x5, x4 __LF \ + adcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + adcs x7, x7, x4 __LF \ + adcs x8, x8, x3 __LF \ + ldp x9, x10, [P1+32] __LF \ + ldp x4, x3, [P2+32] __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x3 __LF \ + csetm x3, cs __LF \ + mov x4, #0xffffffff __LF \ + and x4, x4, x3 __LF \ + subs x5, x5, x4 __LF \ + eor x4, x4, x3 __LF \ + sbcs x6, x6, x4 __LF \ + mov x4, #0xfffffffffffffffe __LF \ + and x4, x4, x3 __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + sbcs x9, x9, x3 __LF \ + sbc x10, x10, x3 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] + +// P0 = 3 * P1 - 8 * P2 + +#define cmsub38_p384(P0,P1,P2) \ + ldp x0, x1, [P2] __LF \ + mov x6, #0x00000000ffffffff __LF \ + subs x6, x6, x0 __LF \ + mov x7, #0xffffffff00000000 __LF \ + sbcs x7, x7, x1 __LF \ + ldp x0, x1, [P2+16] __LF \ + mov x8, #0xfffffffffffffffe __LF \ + sbcs x8, x8, x0 __LF \ + mov x13, #0xffffffffffffffff __LF \ + sbcs x9, x13, x1 __LF \ + ldp x0, x1, [P2+32] __LF \ + sbcs x10, x13, x0 __LF \ + sbc x11, x13, x1 __LF \ + lsl x0, x6, #3 __LF \ + extr x1, x7, x6, #61 __LF \ + extr x2, x8, x7, #61 __LF \ + extr x3, x9, x8, #61 __LF \ + extr x4, x10, x9, #61 __LF \ + extr x5, x11, x10, #61 __LF \ + lsr x6, x11, #61 
__LF \ + add x6, x6, #1 __LF \ + ldp x8, x9, [P1] __LF \ + ldp x10, x11, [P1+16] __LF \ + ldp x12, x13, [P1+32] __LF \ + mov x14, 3 __LF \ + mul x15, x14, x8 __LF \ + umulh x8, x14, x8 __LF \ + adds x0, x0, x15 __LF \ + mul x15, x14, x9 __LF \ + umulh x9, x14, x9 __LF \ + adcs x1, x1, x15 __LF \ + mul x15, x14, x10 __LF \ + umulh x10, x14, x10 __LF \ + adcs x2, x2, x15 __LF \ + mul x15, x14, x11 __LF \ + umulh x11, x14, x11 __LF \ + adcs x3, x3, x15 __LF \ + mul x15, x14, x12 __LF \ + umulh x12, x14, x12 __LF \ + adcs x4, x4, x15 __LF \ + mul x15, x14, x13 __LF \ + umulh x13, x14, x13 __LF \ + adcs x5, x5, x15 __LF \ + adc x6, x6, xzr __LF \ + adds x1, x1, x8 __LF \ + adcs x2, x2, x9 __LF \ + adcs x3, x3, x10 __LF \ + adcs x4, x4, x11 __LF \ + adcs x5, x5, x12 __LF \ + adcs x6, x6, x13 __LF \ + lsl x7, x6, #32 __LF \ + subs x8, x6, x7 __LF \ + sbc x7, x7, xzr __LF \ + adds x0, x0, x8 __LF \ + adcs x1, x1, x7 __LF \ + adcs x2, x2, x6 __LF \ + adcs x3, x3, xzr __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + csetm x6, cc __LF \ + mov x7, #0xffffffff __LF \ + and x7, x7, x6 __LF \ + adds x0, x0, x7 __LF \ + eor x7, x7, x6 __LF \ + adcs x1, x1, x7 __LF \ + mov x7, #0xfffffffffffffffe __LF \ + and x7, x7, x6 __LF \ + adcs x2, x2, x7 __LF \ + adcs x3, x3, x6 __LF \ + adcs x4, x4, x6 __LF \ + adc x5, x5, x6 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] __LF \ + stp x4, x5, [P0+32] + +S2N_BN_SYMBOL(p384_montjdouble_alt): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + montsqr_p384(z2,z_1) + montsqr_p384(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + weakadd_p384(t1,x_1,z2) + sub_p384(t2,x_1,z2) + montmul_p384(x2p,t1,t2) + +// t1 = y + z +// x4p = x2p^2 +// xy2 = x * y^2 + + add_p384(t1,y_1,z_1) + montsqr_p384(x4p,x2p) + montmul_p384(xy2,x_1,y2) + +// t2 = (y + z)^2 + + montsqr_p384(t2,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_p384(d,12,xy2,9,x4p) + sub_p384(t1,t2,z2) + +// y4 = y^4 + + montsqr_p384(y4,y2) + +// z_3' = 2 * y * z +// dx2 = d * x2p + + sub_p384(z_3,t1,y2) + montmul_p384(dx2,d,x2p) + +// x' = 4 * xy2 - d + + cmsub41_p384(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_p384(y_3,dx2,y4) + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjmixadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjmixadd.S new file mode 100644 index 00000000000..6c7b121fd84 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjmixadd.S @@ -0,0 +1,876 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates +// +// extern void p384_montjmixadd +// (uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjmixadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjmixadd) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 + +// Stable homes for input arguments during main code sequence + +#define input_z x24 +#define input_x x25 +#define input_y x26 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define NSPACE (NUMSIZE*6) + +// Corresponds to bignum_montmul_p384 except x24 -> x0 + +#define montmul_p384(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P1+32] __LF \ + ldp x9, x10, [P2] __LF \ + ldp x11, x12, [P2+16] __LF \ + ldp x13, x14, [P2+32] __LF \ + mul x15, x3, x9 __LF \ + mul x21, x4, x10 __LF \ + mul x22, x5, x11 __LF \ + umulh x23, x3, x9 __LF \ + umulh x0, x4, x10 __LF \ + umulh x1, x5, x11 __LF \ + adds x23, x23, x21 __LF \ + adcs x0, x0, x22 __LF \ + adc x1, x1, xzr __LF \ + adds x16, x23, x15 __LF \ + adcs x17, x0, x23 __LF \ + adcs x19, x1, x0 __LF \ + adc x20, x1, xzr __LF \ + adds x17, x17, x15 __LF \ + adcs x19, x19, x23 __LF \ + adcs x20, x20, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x0, x3, x4 __LF \ + cneg x0, x0, lo __LF \ + csetm x23, lo __LF \ + subs x22, x10, x9 __LF \ + cneg x22, x22, lo __LF \ + mul x21, x0, x22 __LF \ + umulh x22, x0, x22 __LF \ + cinv x23, x23, lo __LF \ + eor x21, x21, x23 __LF \ + eor x22, x22, x23 __LF \ + cmn x23, #1 __LF \ + adcs x16, x16, x21 __LF \ + adcs x17, x17, x22 __LF \ + adcs x19, x19, x23 __LF \ + adcs x20, x20, x23 __LF \ + adc x1, x1, x23 __LF \ + subs x0, x3, x5 __LF \ + cneg x0, x0, lo __LF \ + csetm x23, lo __LF \ + subs x22, x11, x9 __LF \ + cneg x22, x22, lo __LF \ + mul x21, x0, x22 __LF \ + umulh x22, x0, x22 __LF \ + cinv x23, x23, lo __LF \ + eor x21, x21, x23 __LF \ + eor x22, x22, x23 __LF \ + cmn x23, #1 __LF \ + adcs x17, x17, x21 __LF \ + adcs x19, x19, x22 __LF \ + adcs x20, x20, x23 
__LF \ + adc x1, x1, x23 __LF \ + subs x0, x4, x5 __LF \ + cneg x0, x0, lo __LF \ + csetm x23, lo __LF \ + subs x22, x11, x10 __LF \ + cneg x22, x22, lo __LF \ + mul x21, x0, x22 __LF \ + umulh x22, x0, x22 __LF \ + cinv x23, x23, lo __LF \ + eor x21, x21, x23 __LF \ + eor x22, x22, x23 __LF \ + cmn x23, #1 __LF \ + adcs x19, x19, x21 __LF \ + adcs x20, x20, x22 __LF \ + adc x1, x1, x23 __LF \ + lsl x23, x15, #32 __LF \ + add x15, x23, x15 __LF \ + lsr x23, x15, #32 __LF \ + subs x23, x23, x15 __LF \ + sbc x22, x15, xzr __LF \ + extr x23, x22, x23, #32 __LF \ + lsr x22, x22, #32 __LF \ + adds x22, x22, x15 __LF \ + adc x21, xzr, xzr __LF \ + subs x16, x16, x23 __LF \ + sbcs x17, x17, x22 __LF \ + sbcs x19, x19, x21 __LF \ + sbcs x20, x20, xzr __LF \ + sbcs x1, x1, xzr __LF \ + sbc x15, x15, xzr __LF \ + lsl x23, x16, #32 __LF \ + add x16, x23, x16 __LF \ + lsr x23, x16, #32 __LF \ + subs x23, x23, x16 __LF \ + sbc x22, x16, xzr __LF \ + extr x23, x22, x23, #32 __LF \ + lsr x22, x22, #32 __LF \ + adds x22, x22, x16 __LF \ + adc x21, xzr, xzr __LF \ + subs x17, x17, x23 __LF \ + sbcs x19, x19, x22 __LF \ + sbcs x20, x20, x21 __LF \ + sbcs x1, x1, xzr __LF \ + sbcs x15, x15, xzr __LF \ + sbc x16, x16, xzr __LF \ + lsl x23, x17, #32 __LF \ + add x17, x23, x17 __LF \ + lsr x23, x17, #32 __LF \ + subs x23, x23, x17 __LF \ + sbc x22, x17, xzr __LF \ + extr x23, x22, x23, #32 __LF \ + lsr x22, x22, #32 __LF \ + adds x22, x22, x17 __LF \ + adc x21, xzr, xzr __LF \ + subs x19, x19, x23 __LF \ + sbcs x20, x20, x22 __LF \ + sbcs x1, x1, x21 __LF \ + sbcs x15, x15, xzr __LF \ + sbcs x16, x16, xzr __LF \ + sbc x17, x17, xzr __LF \ + stp x19, x20, [P0] __LF \ + stp x1, x15, [P0+16] __LF \ + stp x16, x17, [P0+32] __LF \ + mul x15, x6, x12 __LF \ + mul x21, x7, x13 __LF \ + mul x22, x8, x14 __LF \ + umulh x23, x6, x12 __LF \ + umulh x0, x7, x13 __LF \ + umulh x1, x8, x14 __LF \ + adds x23, x23, x21 __LF \ + adcs x0, x0, x22 __LF \ + adc x1, x1, xzr __LF \ + adds x16, x23, x15 __LF \ + adcs x17, x0, x23 __LF \ + adcs x19, x1, x0 __LF \ + adc x20, x1, xzr __LF \ + adds x17, x17, x15 __LF \ + adcs x19, x19, x23 __LF \ + adcs x20, x20, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x0, x6, x7 __LF \ + cneg x0, x0, lo __LF \ + csetm x23, lo __LF \ + subs x22, x13, x12 __LF \ + cneg x22, x22, lo __LF \ + mul x21, x0, x22 __LF \ + umulh x22, x0, x22 __LF \ + cinv x23, x23, lo __LF \ + eor x21, x21, x23 __LF \ + eor x22, x22, x23 __LF \ + cmn x23, #1 __LF \ + adcs x16, x16, x21 __LF \ + adcs x17, x17, x22 __LF \ + adcs x19, x19, x23 __LF \ + adcs x20, x20, x23 __LF \ + adc x1, x1, x23 __LF \ + subs x0, x6, x8 __LF \ + cneg x0, x0, lo __LF \ + csetm x23, lo __LF \ + subs x22, x14, x12 __LF \ + cneg x22, x22, lo __LF \ + mul x21, x0, x22 __LF \ + umulh x22, x0, x22 __LF \ + cinv x23, x23, lo __LF \ + eor x21, x21, x23 __LF \ + eor x22, x22, x23 __LF \ + cmn x23, #1 __LF \ + adcs x17, x17, x21 __LF \ + adcs x19, x19, x22 __LF \ + adcs x20, x20, x23 __LF \ + adc x1, x1, x23 __LF \ + subs x0, x7, x8 __LF \ + cneg x0, x0, lo __LF \ + csetm x23, lo __LF \ + subs x22, x14, x13 __LF \ + cneg x22, x22, lo __LF \ + mul x21, x0, x22 __LF \ + umulh x22, x0, x22 __LF \ + cinv x23, x23, lo __LF \ + eor x21, x21, x23 __LF \ + eor x22, x22, x23 __LF \ + cmn x23, #1 __LF \ + adcs x19, x19, x21 __LF \ + adcs x20, x20, x22 __LF \ + adc x1, x1, x23 __LF \ + subs x6, x6, x3 __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x5 __LF \ + ngc x3, xzr __LF \ + cmn x3, #1 __LF \ + eor x6, x6, x3 __LF \ + adcs x6, x6, xzr __LF \ + eor x7, x7, x3 __LF 
\ + adcs x7, x7, xzr __LF \ + eor x8, x8, x3 __LF \ + adc x8, x8, xzr __LF \ + subs x9, x9, x12 __LF \ + sbcs x10, x10, x13 __LF \ + sbcs x11, x11, x14 __LF \ + ngc x14, xzr __LF \ + cmn x14, #1 __LF \ + eor x9, x9, x14 __LF \ + adcs x9, x9, xzr __LF \ + eor x10, x10, x14 __LF \ + adcs x10, x10, xzr __LF \ + eor x11, x11, x14 __LF \ + adc x11, x11, xzr __LF \ + eor x14, x3, x14 __LF \ + ldp x21, x22, [P0] __LF \ + adds x15, x15, x21 __LF \ + adcs x16, x16, x22 __LF \ + ldp x21, x22, [P0+16] __LF \ + adcs x17, x17, x21 __LF \ + adcs x19, x19, x22 __LF \ + ldp x21, x22, [P0+32] __LF \ + adcs x20, x20, x21 __LF \ + adcs x1, x1, x22 __LF \ + adc x2, xzr, xzr __LF \ + stp x15, x16, [P0] __LF \ + stp x17, x19, [P0+16] __LF \ + stp x20, x1, [P0+32] __LF \ + mul x15, x6, x9 __LF \ + mul x21, x7, x10 __LF \ + mul x22, x8, x11 __LF \ + umulh x23, x6, x9 __LF \ + umulh x0, x7, x10 __LF \ + umulh x1, x8, x11 __LF \ + adds x23, x23, x21 __LF \ + adcs x0, x0, x22 __LF \ + adc x1, x1, xzr __LF \ + adds x16, x23, x15 __LF \ + adcs x17, x0, x23 __LF \ + adcs x19, x1, x0 __LF \ + adc x20, x1, xzr __LF \ + adds x17, x17, x15 __LF \ + adcs x19, x19, x23 __LF \ + adcs x20, x20, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x0, x6, x7 __LF \ + cneg x0, x0, lo __LF \ + csetm x23, lo __LF \ + subs x22, x10, x9 __LF \ + cneg x22, x22, lo __LF \ + mul x21, x0, x22 __LF \ + umulh x22, x0, x22 __LF \ + cinv x23, x23, lo __LF \ + eor x21, x21, x23 __LF \ + eor x22, x22, x23 __LF \ + cmn x23, #1 __LF \ + adcs x16, x16, x21 __LF \ + adcs x17, x17, x22 __LF \ + adcs x19, x19, x23 __LF \ + adcs x20, x20, x23 __LF \ + adc x1, x1, x23 __LF \ + subs x0, x6, x8 __LF \ + cneg x0, x0, lo __LF \ + csetm x23, lo __LF \ + subs x22, x11, x9 __LF \ + cneg x22, x22, lo __LF \ + mul x21, x0, x22 __LF \ + umulh x22, x0, x22 __LF \ + cinv x23, x23, lo __LF \ + eor x21, x21, x23 __LF \ + eor x22, x22, x23 __LF \ + cmn x23, #1 __LF \ + adcs x17, x17, x21 __LF \ + adcs x19, x19, x22 __LF \ + adcs x20, x20, x23 __LF \ + adc x1, x1, x23 __LF \ + subs x0, x7, x8 __LF \ + cneg x0, x0, lo __LF \ + csetm x23, lo __LF \ + subs x22, x11, x10 __LF \ + cneg x22, x22, lo __LF \ + mul x21, x0, x22 __LF \ + umulh x22, x0, x22 __LF \ + cinv x23, x23, lo __LF \ + eor x21, x21, x23 __LF \ + eor x22, x22, x23 __LF \ + cmn x23, #1 __LF \ + adcs x19, x19, x21 __LF \ + adcs x20, x20, x22 __LF \ + adc x1, x1, x23 __LF \ + ldp x3, x4, [P0] __LF \ + ldp x5, x6, [P0+16] __LF \ + ldp x7, x8, [P0+32] __LF \ + cmn x14, #1 __LF \ + eor x15, x15, x14 __LF \ + adcs x15, x15, x3 __LF \ + eor x16, x16, x14 __LF \ + adcs x16, x16, x4 __LF \ + eor x17, x17, x14 __LF \ + adcs x17, x17, x5 __LF \ + eor x19, x19, x14 __LF \ + adcs x19, x19, x6 __LF \ + eor x20, x20, x14 __LF \ + adcs x20, x20, x7 __LF \ + eor x1, x1, x14 __LF \ + adcs x1, x1, x8 __LF \ + adcs x9, x14, x2 __LF \ + adcs x10, x14, xzr __LF \ + adcs x11, x14, xzr __LF \ + adc x12, x14, xzr __LF \ + adds x19, x19, x3 __LF \ + adcs x20, x20, x4 __LF \ + adcs x1, x1, x5 __LF \ + adcs x9, x9, x6 __LF \ + adcs x10, x10, x7 __LF \ + adcs x11, x11, x8 __LF \ + adc x12, x12, x2 __LF \ + lsl x23, x15, #32 __LF \ + add x15, x23, x15 __LF \ + lsr x23, x15, #32 __LF \ + subs x23, x23, x15 __LF \ + sbc x22, x15, xzr __LF \ + extr x23, x22, x23, #32 __LF \ + lsr x22, x22, #32 __LF \ + adds x22, x22, x15 __LF \ + adc x21, xzr, xzr __LF \ + subs x16, x16, x23 __LF \ + sbcs x17, x17, x22 __LF \ + sbcs x19, x19, x21 __LF \ + sbcs x20, x20, xzr __LF \ + sbcs x1, x1, xzr __LF \ + sbc x15, x15, xzr __LF \ + lsl x23, x16, #32 __LF \ + add 
x16, x23, x16 __LF \ + lsr x23, x16, #32 __LF \ + subs x23, x23, x16 __LF \ + sbc x22, x16, xzr __LF \ + extr x23, x22, x23, #32 __LF \ + lsr x22, x22, #32 __LF \ + adds x22, x22, x16 __LF \ + adc x21, xzr, xzr __LF \ + subs x17, x17, x23 __LF \ + sbcs x19, x19, x22 __LF \ + sbcs x20, x20, x21 __LF \ + sbcs x1, x1, xzr __LF \ + sbcs x15, x15, xzr __LF \ + sbc x16, x16, xzr __LF \ + lsl x23, x17, #32 __LF \ + add x17, x23, x17 __LF \ + lsr x23, x17, #32 __LF \ + subs x23, x23, x17 __LF \ + sbc x22, x17, xzr __LF \ + extr x23, x22, x23, #32 __LF \ + lsr x22, x22, #32 __LF \ + adds x22, x22, x17 __LF \ + adc x21, xzr, xzr __LF \ + subs x19, x19, x23 __LF \ + sbcs x20, x20, x22 __LF \ + sbcs x1, x1, x21 __LF \ + sbcs x15, x15, xzr __LF \ + sbcs x16, x16, xzr __LF \ + sbc x17, x17, xzr __LF \ + adds x9, x9, x15 __LF \ + adcs x10, x10, x16 __LF \ + adcs x11, x11, x17 __LF \ + adc x12, x12, xzr __LF \ + add x22, x12, #1 __LF \ + lsl x21, x22, #32 __LF \ + subs x0, x22, x21 __LF \ + sbc x21, x21, xzr __LF \ + adds x19, x19, x0 __LF \ + adcs x20, x20, x21 __LF \ + adcs x1, x1, x22 __LF \ + adcs x9, x9, xzr __LF \ + adcs x10, x10, xzr __LF \ + adcs x11, x11, xzr __LF \ + csetm x22, lo __LF \ + mov x23, #4294967295 __LF \ + and x23, x23, x22 __LF \ + adds x19, x19, x23 __LF \ + eor x23, x23, x22 __LF \ + adcs x20, x20, x23 __LF \ + mov x23, #-2 __LF \ + and x23, x23, x22 __LF \ + adcs x1, x1, x23 __LF \ + adcs x9, x9, x22 __LF \ + adcs x10, x10, x22 __LF \ + adc x11, x11, x22 __LF \ + stp x19, x20, [P0] __LF \ + stp x1, x9, [P0+16] __LF \ + stp x10, x11, [P0+32] + +// Corresponds exactly to bignum_montsqr_p384 + +#define montsqr_p384(P0,P1) \ + ldp x2, x3, [P1] __LF \ + ldp x4, x5, [P1+16] __LF \ + ldp x6, x7, [P1+32] __LF \ + mul x14, x2, x3 __LF \ + mul x15, x2, x4 __LF \ + mul x16, x3, x4 __LF \ + mul x8, x2, x2 __LF \ + mul x10, x3, x3 __LF \ + mul x12, x4, x4 __LF \ + umulh x17, x2, x3 __LF \ + adds x15, x15, x17 __LF \ + umulh x17, x2, x4 __LF \ + adcs x16, x16, x17 __LF \ + umulh x17, x3, x4 __LF \ + adcs x17, x17, xzr __LF \ + umulh x9, x2, x2 __LF \ + umulh x11, x3, x3 __LF \ + umulh x13, x4, x4 __LF \ + adds x14, x14, x14 __LF \ + adcs x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adcs x17, x17, x17 __LF \ + adc x13, x13, xzr __LF \ + adds x9, x9, x14 __LF \ + adcs x10, x10, x15 __LF \ + adcs x11, x11, x16 __LF \ + adcs x12, x12, x17 __LF \ + adc x13, x13, xzr __LF \ + lsl x16, x8, #32 __LF \ + add x8, x16, x8 __LF \ + lsr x16, x8, #32 __LF \ + subs x16, x16, x8 __LF \ + sbc x15, x8, xzr __LF \ + extr x16, x15, x16, #32 __LF \ + lsr x15, x15, #32 __LF \ + adds x15, x15, x8 __LF \ + adc x14, xzr, xzr __LF \ + subs x9, x9, x16 __LF \ + sbcs x10, x10, x15 __LF \ + sbcs x11, x11, x14 __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + sbc x8, x8, xzr __LF \ + lsl x16, x9, #32 __LF \ + add x9, x16, x9 __LF \ + lsr x16, x9, #32 __LF \ + subs x16, x16, x9 __LF \ + sbc x15, x9, xzr __LF \ + extr x16, x15, x16, #32 __LF \ + lsr x15, x15, #32 __LF \ + adds x15, x15, x9 __LF \ + adc x14, xzr, xzr __LF \ + subs x10, x10, x16 __LF \ + sbcs x11, x11, x15 __LF \ + sbcs x12, x12, x14 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x8, x8, xzr __LF \ + sbc x9, x9, xzr __LF \ + lsl x16, x10, #32 __LF \ + add x10, x16, x10 __LF \ + lsr x16, x10, #32 __LF \ + subs x16, x16, x10 __LF \ + sbc x15, x10, xzr __LF \ + extr x16, x15, x16, #32 __LF \ + lsr x15, x15, #32 __LF \ + adds x15, x15, x10 __LF \ + adc x14, xzr, xzr __LF \ + subs x11, x11, x16 __LF \ + sbcs x12, x12, x15 __LF \ + sbcs x13, x13, 
x14 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + stp x11, x12, [P0] __LF \ + stp x13, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] __LF \ + mul x8, x2, x5 __LF \ + mul x14, x3, x6 __LF \ + mul x15, x4, x7 __LF \ + umulh x16, x2, x5 __LF \ + umulh x17, x3, x6 __LF \ + umulh x1, x4, x7 __LF \ + adds x16, x16, x14 __LF \ + adcs x17, x17, x15 __LF \ + adc x1, x1, xzr __LF \ + adds x9, x16, x8 __LF \ + adcs x10, x17, x16 __LF \ + adcs x11, x1, x17 __LF \ + adc x12, x1, xzr __LF \ + adds x10, x10, x8 __LF \ + adcs x11, x11, x16 __LF \ + adcs x12, x12, x17 __LF \ + adc x13, x1, xzr __LF \ + subs x17, x2, x3 __LF \ + cneg x17, x17, lo __LF \ + csetm x14, lo __LF \ + subs x15, x6, x5 __LF \ + cneg x15, x15, lo __LF \ + mul x16, x17, x15 __LF \ + umulh x15, x17, x15 __LF \ + cinv x14, x14, lo __LF \ + eor x16, x16, x14 __LF \ + eor x15, x15, x14 __LF \ + cmn x14, #1 __LF \ + adcs x9, x9, x16 __LF \ + adcs x10, x10, x15 __LF \ + adcs x11, x11, x14 __LF \ + adcs x12, x12, x14 __LF \ + adc x13, x13, x14 __LF \ + subs x17, x2, x4 __LF \ + cneg x17, x17, lo __LF \ + csetm x14, lo __LF \ + subs x15, x7, x5 __LF \ + cneg x15, x15, lo __LF \ + mul x16, x17, x15 __LF \ + umulh x15, x17, x15 __LF \ + cinv x14, x14, lo __LF \ + eor x16, x16, x14 __LF \ + eor x15, x15, x14 __LF \ + cmn x14, #1 __LF \ + adcs x10, x10, x16 __LF \ + adcs x11, x11, x15 __LF \ + adcs x12, x12, x14 __LF \ + adc x13, x13, x14 __LF \ + subs x17, x3, x4 __LF \ + cneg x17, x17, lo __LF \ + csetm x14, lo __LF \ + subs x15, x7, x6 __LF \ + cneg x15, x15, lo __LF \ + mul x16, x17, x15 __LF \ + umulh x15, x17, x15 __LF \ + cinv x14, x14, lo __LF \ + eor x16, x16, x14 __LF \ + eor x15, x15, x14 __LF \ + cmn x14, #1 __LF \ + adcs x11, x11, x16 __LF \ + adcs x12, x12, x15 __LF \ + adc x13, x13, x14 __LF \ + adds x8, x8, x8 __LF \ + adcs x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adc x17, xzr, xzr __LF \ + ldp x2, x3, [P0] __LF \ + adds x8, x8, x2 __LF \ + adcs x9, x9, x3 __LF \ + ldp x2, x3, [P0+16] __LF \ + adcs x10, x10, x2 __LF \ + adcs x11, x11, x3 __LF \ + ldp x2, x3, [P0+32] __LF \ + adcs x12, x12, x2 __LF \ + adcs x13, x13, x3 __LF \ + adc x17, x17, xzr __LF \ + lsl x4, x8, #32 __LF \ + add x8, x4, x8 __LF \ + lsr x4, x8, #32 __LF \ + subs x4, x4, x8 __LF \ + sbc x3, x8, xzr __LF \ + extr x4, x3, x4, #32 __LF \ + lsr x3, x3, #32 __LF \ + adds x3, x3, x8 __LF \ + adc x2, xzr, xzr __LF \ + subs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + sbcs x11, x11, x2 __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + sbc x8, x8, xzr __LF \ + lsl x4, x9, #32 __LF \ + add x9, x4, x9 __LF \ + lsr x4, x9, #32 __LF \ + subs x4, x4, x9 __LF \ + sbc x3, x9, xzr __LF \ + extr x4, x3, x4, #32 __LF \ + lsr x3, x3, #32 __LF \ + adds x3, x3, x9 __LF \ + adc x2, xzr, xzr __LF \ + subs x10, x10, x4 __LF \ + sbcs x11, x11, x3 __LF \ + sbcs x12, x12, x2 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x8, x8, xzr __LF \ + sbc x9, x9, xzr __LF \ + lsl x4, x10, #32 __LF \ + add x10, x4, x10 __LF \ + lsr x4, x10, #32 __LF \ + subs x4, x4, x10 __LF \ + sbc x3, x10, xzr __LF \ + extr x4, x3, x4, #32 __LF \ + lsr x3, x3, #32 __LF \ + adds x3, x3, x10 __LF \ + adc x2, xzr, xzr __LF \ + subs x11, x11, x4 __LF \ + sbcs x12, x12, x3 __LF \ + sbcs x13, x13, x2 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + adds x17, x17, x8 __LF \ + adcs x8, x9, xzr __LF \ + adcs x9, x10, xzr __LF \ + adcs x10, xzr, 
xzr __LF \ + mul x1, x5, x5 __LF \ + adds x11, x11, x1 __LF \ + mul x14, x6, x6 __LF \ + mul x15, x7, x7 __LF \ + umulh x1, x5, x5 __LF \ + adcs x12, x12, x1 __LF \ + umulh x1, x6, x6 __LF \ + adcs x13, x13, x14 __LF \ + adcs x17, x17, x1 __LF \ + umulh x1, x7, x7 __LF \ + adcs x8, x8, x15 __LF \ + adcs x9, x9, x1 __LF \ + adc x10, x10, xzr __LF \ + mul x1, x5, x6 __LF \ + mul x14, x5, x7 __LF \ + mul x15, x6, x7 __LF \ + umulh x16, x5, x6 __LF \ + adds x14, x14, x16 __LF \ + umulh x16, x5, x7 __LF \ + adcs x15, x15, x16 __LF \ + umulh x16, x6, x7 __LF \ + adc x16, x16, xzr __LF \ + adds x1, x1, x1 __LF \ + adcs x14, x14, x14 __LF \ + adcs x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x5, xzr, xzr __LF \ + adds x12, x12, x1 __LF \ + adcs x13, x13, x14 __LF \ + adcs x17, x17, x15 __LF \ + adcs x8, x8, x16 __LF \ + adcs x9, x9, x5 __LF \ + adc x10, x10, xzr __LF \ + mov x1, #-4294967295 __LF \ + mov x14, #4294967295 __LF \ + mov x15, #1 __LF \ + cmn x11, x1 __LF \ + adcs xzr, x12, x14 __LF \ + adcs xzr, x13, x15 __LF \ + adcs xzr, x17, xzr __LF \ + adcs xzr, x8, xzr __LF \ + adcs xzr, x9, xzr __LF \ + adc x10, x10, xzr __LF \ + neg x10, x10 __LF \ + and x1, x1, x10 __LF \ + adds x11, x11, x1 __LF \ + and x14, x14, x10 __LF \ + adcs x12, x12, x14 __LF \ + and x15, x15, x10 __LF \ + adcs x13, x13, x15 __LF \ + adcs x17, x17, xzr __LF \ + adcs x8, x8, xzr __LF \ + adc x9, x9, xzr __LF \ + stp x11, x12, [P0] __LF \ + stp x13, x17, [P0+16] __LF \ + stp x8, x9, [P0+32] + +// Corresponds exactly to bignum_sub_p384 + +#define sub_p384(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + ldp x9, x10, [P1+32] __LF \ + ldp x4, x3, [P2+32] __LF \ + sbcs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + csetm x3, lo __LF \ + mov x4, #4294967295 __LF \ + and x4, x4, x3 __LF \ + adds x5, x5, x4 __LF \ + eor x4, x4, x3 __LF \ + adcs x6, x6, x4 __LF \ + mov x4, #-2 __LF \ + and x4, x4, x3 __LF \ + adcs x7, x7, x4 __LF \ + adcs x8, x8, x3 __LF \ + adcs x9, x9, x3 __LF \ + adc x10, x10, x3 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] + +S2N_BN_SYMBOL(p384_montjmixadd): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + montsqr_p384(zp2,z_1) + montmul_p384(y2a,z_1,y_2) + + montmul_p384(x2a,zp2,x_2) + montmul_p384(y2a,zp2,y2a) + + sub_p384(xd,x2a,x_1) + sub_p384(yd,y2a,y_1) + + montsqr_p384(zz,xd) + montsqr_p384(ww,yd) + + montmul_p384(zzx1,zz,x_1) + montmul_p384(zzx2,zz,x2a) + + sub_p384(resx,ww,zzx1) + sub_p384(t1,zzx2,zzx1) + + montmul_p384(resz,xd,z_1) + + sub_p384(resx,resx,zzx2) + + sub_p384(t2,zzx1,resx) + + montmul_p384(t1,t1,y_1) + montmul_p384(t2,yd,t2) + + sub_p384(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + ldp x4, x5, [z_1+32] + orr x6, x0, x1 + orr x7, x2, x3 + orr x8, x4, x5 + orr x6, x6, x7 + orr x6, x6, x8 + cmp x6, xzr + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. 
+// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^384 - p_384), +// hence giving 0 + p2 = p2 for the final result. + + ldp x0, x1, [resx] + ldp x19, x20, [x_2] + csel x0, x0, x19, ne + csel x1, x1, x20, ne + ldp x2, x3, [resx+16] + ldp x19, x20, [x_2+16] + csel x2, x2, x19, ne + csel x3, x3, x20, ne + ldp x4, x5, [resx+32] + ldp x19, x20, [x_2+32] + csel x4, x4, x19, ne + csel x5, x5, x20, ne + + ldp x6, x7, [resy] + ldp x19, x20, [y_2] + csel x6, x6, x19, ne + csel x7, x7, x20, ne + ldp x8, x9, [resy+16] + ldp x19, x20, [y_2+16] + csel x8, x8, x19, ne + csel x9, x9, x20, ne + ldp x10, x11, [resy+32] + ldp x19, x20, [y_2+32] + csel x10, x10, x19, ne + csel x11, x11, x20, ne + + ldp x12, x13, [resz] + mov x19, #0xffffffff00000001 + mov x20, #0x00000000ffffffff + csel x12, x12, x19, ne + csel x13, x13, x20, ne + ldp x14, x15, [resz+16] + mov x19, #1 + csel x14, x14, x19, ne + csel x15, x15, xzr, ne + ldp x16, x17, [resz+32] + csel x16, x16, xzr, ne + csel x17, x17, xzr, ne + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [x_3+32] + stp x6, x7, [y_3] + stp x8, x9, [y_3+16] + stp x10, x11, [y_3+32] + stp x12, x13, [z_3] + stp x14, x15, [z_3+16] + stp x16, x17, [z_3+32] + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjmixadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjmixadd_alt.S new file mode 100644 index 00000000000..44756c0bd6f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjmixadd_alt.S @@ -0,0 +1,941 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates +// +// extern void p384_montjmixadd_alt +// (uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. 
+// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjmixadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjmixadd_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 + +// Stable homes for input arguments during main code sequence + +#define input_z x24 +#define input_x x25 +#define input_y x26 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define NSPACE (NUMSIZE*6) + +// Corresponds exactly to bignum_montmul_p384_alt + +#define montmul_p384(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + mul x12, x3, x5 __LF \ + umulh x13, x3, x5 __LF \ + mul x11, x3, x6 __LF \ + umulh x14, x3, x6 __LF \ + adds x13, x13, x11 __LF \ + ldp x7, x8, [P2+16] __LF \ + mul x11, x3, x7 __LF \ + umulh x15, x3, x7 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x8 __LF \ + umulh x16, x3, x8 __LF \ + adcs x15, x15, x11 __LF \ + ldp x9, x10, [P2+32] __LF \ + mul x11, x3, x9 __LF \ + umulh x17, x3, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x19, x3, x10 __LF \ + adcs x17, x17, x11 __LF \ + adc x19, x19, xzr __LF \ + mul x11, x4, x5 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x6 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x7 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x17, x17, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x19, x19, x11 __LF \ + cset x20, cs __LF \ + umulh x11, x4, x5 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x6 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x7 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x17, x17, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x19, x19, x11 __LF \ + umulh x11, x4, x10 __LF \ + adc x20, x20, x11 __LF \ + ldp x3, x4, [P1+16] __LF \ + mul x11, x3, x5 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x3, x6 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x3, x7 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x3, x8 __LF \ + adcs x17, x17, x11 __LF \ + mul x11, x3, x9 __LF \ + adcs x19, x19, x11 __LF \ + mul x11, x3, x10 __LF \ + adcs x20, x20, x11 __LF \ + cset x21, cs __LF \ + umulh x11, x3, x5 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x3, x6 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x3, x7 __LF \ + adcs x17, x17, x11 __LF \ + umulh x11, x3, x8 __LF \ + adcs x19, x19, x11 __LF \ + umulh x11, x3, x9 __LF \ + adcs x20, x20, x11 __LF \ + umulh x11, x3, x10 __LF \ + adc x21, x21, x11 __LF \ + mul x11, x4, x5 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x4, x6 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x4, x7 __LF \ + 
adcs x17, x17, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x19, x19, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x20, x20, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x21, x21, x11 __LF \ + cset x22, cs __LF \ + umulh x11, x4, x5 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x4, x6 __LF \ + adcs x17, x17, x11 __LF \ + umulh x11, x4, x7 __LF \ + adcs x19, x19, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x20, x20, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x21, x21, x11 __LF \ + umulh x11, x4, x10 __LF \ + adc x22, x22, x11 __LF \ + ldp x3, x4, [P1+32] __LF \ + mul x11, x3, x5 __LF \ + adds x16, x16, x11 __LF \ + mul x11, x3, x6 __LF \ + adcs x17, x17, x11 __LF \ + mul x11, x3, x7 __LF \ + adcs x19, x19, x11 __LF \ + mul x11, x3, x8 __LF \ + adcs x20, x20, x11 __LF \ + mul x11, x3, x9 __LF \ + adcs x21, x21, x11 __LF \ + mul x11, x3, x10 __LF \ + adcs x22, x22, x11 __LF \ + cset x2, cs __LF \ + umulh x11, x3, x5 __LF \ + adds x17, x17, x11 __LF \ + umulh x11, x3, x6 __LF \ + adcs x19, x19, x11 __LF \ + umulh x11, x3, x7 __LF \ + adcs x20, x20, x11 __LF \ + umulh x11, x3, x8 __LF \ + adcs x21, x21, x11 __LF \ + umulh x11, x3, x9 __LF \ + adcs x22, x22, x11 __LF \ + umulh x11, x3, x10 __LF \ + adc x2, x2, x11 __LF \ + mul x11, x4, x5 __LF \ + adds x17, x17, x11 __LF \ + mul x11, x4, x6 __LF \ + adcs x19, x19, x11 __LF \ + mul x11, x4, x7 __LF \ + adcs x20, x20, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x21, x21, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x22, x22, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x2, x2, x11 __LF \ + cset x1, cs __LF \ + umulh x11, x4, x5 __LF \ + adds x19, x19, x11 __LF \ + umulh x11, x4, x6 __LF \ + adcs x20, x20, x11 __LF \ + umulh x11, x4, x7 __LF \ + adcs x21, x21, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x22, x22, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x2, x2, x11 __LF \ + umulh x11, x4, x10 __LF \ + adc x1, x1, x11 __LF \ + lsl x7, x12, #32 __LF \ + add x12, x7, x12 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x12 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x12 __LF \ + umulh x6, x6, x12 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x12 __LF \ + adc x5, xzr, xzr __LF \ + subs x13, x13, x7 __LF \ + sbcs x14, x14, x6 __LF \ + sbcs x15, x15, x5 __LF \ + sbcs x16, x16, xzr __LF \ + sbcs x17, x17, xzr __LF \ + sbc x12, x12, xzr __LF \ + lsl x7, x13, #32 __LF \ + add x13, x7, x13 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x13 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x13 __LF \ + umulh x6, x6, x13 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x13 __LF \ + adc x5, xzr, xzr __LF \ + subs x14, x14, x7 __LF \ + sbcs x15, x15, x6 __LF \ + sbcs x16, x16, x5 __LF \ + sbcs x17, x17, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbc x13, x13, xzr __LF \ + lsl x7, x14, #32 __LF \ + add x14, x7, x14 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x14 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x14 __LF \ + umulh x6, x6, x14 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x14 __LF \ + adc x5, xzr, xzr __LF \ + subs x15, x15, x7 __LF \ + sbcs x16, x16, x6 __LF \ + sbcs x17, x17, x5 __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + sbc x14, x14, xzr __LF \ + lsl x7, x15, #32 __LF \ + add x15, x7, x15 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x15 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x15 __LF \ + umulh x6, x6, x15 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x15 __LF \ + adc x5, xzr, xzr __LF \ + subs x16, x16, x7 __LF \ + sbcs x17, x17, x6 __LF \ + sbcs x12, 
x12, x5 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + lsl x7, x16, #32 __LF \ + add x16, x7, x16 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x16 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x16 __LF \ + umulh x6, x6, x16 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x16 __LF \ + adc x5, xzr, xzr __LF \ + subs x17, x17, x7 __LF \ + sbcs x12, x12, x6 __LF \ + sbcs x13, x13, x5 __LF \ + sbcs x14, x14, xzr __LF \ + sbcs x15, x15, xzr __LF \ + sbc x16, x16, xzr __LF \ + lsl x7, x17, #32 __LF \ + add x17, x7, x17 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x17 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x17 __LF \ + umulh x6, x6, x17 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x17 __LF \ + adc x5, xzr, xzr __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, x6 __LF \ + sbcs x14, x14, x5 __LF \ + sbcs x15, x15, xzr __LF \ + sbcs x16, x16, xzr __LF \ + sbc x17, x17, xzr __LF \ + adds x12, x12, x19 __LF \ + adcs x13, x13, x20 __LF \ + adcs x14, x14, x21 __LF \ + adcs x15, x15, x22 __LF \ + adcs x16, x16, x2 __LF \ + adcs x17, x17, x1 __LF \ + adc x10, xzr, xzr __LF \ + mov x11, #0xffffffff00000001 __LF \ + adds x19, x12, x11 __LF \ + mov x11, #0xffffffff __LF \ + adcs x20, x13, x11 __LF \ + mov x11, #0x1 __LF \ + adcs x21, x14, x11 __LF \ + adcs x22, x15, xzr __LF \ + adcs x2, x16, xzr __LF \ + adcs x1, x17, xzr __LF \ + adcs x10, x10, xzr __LF \ + csel x12, x12, x19, eq __LF \ + csel x13, x13, x20, eq __LF \ + csel x14, x14, x21, eq __LF \ + csel x15, x15, x22, eq __LF \ + csel x16, x16, x2, eq __LF \ + csel x17, x17, x1, eq __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x15, [P0+16] __LF \ + stp x16, x17, [P0+32] + +// Corresponds exactly to bignum_montsqr_p384_alt + +#define montsqr_p384(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x8, x2, x4 __LF \ + adds x10, x10, x8 __LF \ + mul x11, x2, x5 __LF \ + mul x8, x3, x4 __LF \ + adcs x11, x11, x8 __LF \ + umulh x12, x2, x5 __LF \ + mul x8, x3, x5 __LF \ + adcs x12, x12, x8 __LF \ + ldp x6, x7, [P1+32] __LF \ + mul x13, x2, x7 __LF \ + mul x8, x3, x6 __LF \ + adcs x13, x13, x8 __LF \ + umulh x14, x2, x7 __LF \ + mul x8, x3, x7 __LF \ + adcs x14, x14, x8 __LF \ + mul x15, x5, x6 __LF \ + adcs x15, x15, xzr __LF \ + umulh x16, x5, x6 __LF \ + adc x16, x16, xzr __LF \ + umulh x8, x2, x4 __LF \ + adds x11, x11, x8 __LF \ + umulh x8, x3, x4 __LF \ + adcs x12, x12, x8 __LF \ + umulh x8, x3, x5 __LF \ + adcs x13, x13, x8 __LF \ + umulh x8, x3, x6 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x3, x7 __LF \ + adcs x15, x15, x8 __LF \ + adc x16, x16, xzr __LF \ + mul x8, x2, x6 __LF \ + adds x12, x12, x8 __LF \ + mul x8, x4, x5 __LF \ + adcs x13, x13, x8 __LF \ + mul x8, x4, x6 __LF \ + adcs x14, x14, x8 __LF \ + mul x8, x4, x7 __LF \ + adcs x15, x15, x8 __LF \ + mul x8, x5, x7 __LF \ + adcs x16, x16, x8 __LF \ + mul x17, x6, x7 __LF \ + adcs x17, x17, xzr __LF \ + umulh x19, x6, x7 __LF \ + adc x19, x19, xzr __LF \ + umulh x8, x2, x6 __LF \ + adds x13, x13, x8 __LF \ + umulh x8, x4, x5 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x4, x6 __LF \ + adcs x15, x15, x8 __LF \ + umulh x8, x4, x7 __LF \ + adcs x16, x16, x8 __LF \ + umulh x8, x5, x7 __LF \ + adcs x17, x17, x8 __LF \ + adc x19, x19, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + adcs x15, x15, x15 __LF \ + 
adcs x16, x16, x16 __LF \ + adcs x17, x17, x17 __LF \ + adcs x19, x19, x19 __LF \ + cset x20, hs __LF \ + umulh x8, x2, x2 __LF \ + mul x2, x2, x2 __LF \ + adds x9, x9, x8 __LF \ + mul x8, x3, x3 __LF \ + adcs x10, x10, x8 __LF \ + umulh x8, x3, x3 __LF \ + adcs x11, x11, x8 __LF \ + mul x8, x4, x4 __LF \ + adcs x12, x12, x8 __LF \ + umulh x8, x4, x4 __LF \ + adcs x13, x13, x8 __LF \ + mul x8, x5, x5 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x5, x5 __LF \ + adcs x15, x15, x8 __LF \ + mul x8, x6, x6 __LF \ + adcs x16, x16, x8 __LF \ + umulh x8, x6, x6 __LF \ + adcs x17, x17, x8 __LF \ + mul x8, x7, x7 __LF \ + adcs x19, x19, x8 __LF \ + umulh x8, x7, x7 __LF \ + adc x20, x20, x8 __LF \ + lsl x5, x2, #32 __LF \ + add x2, x5, x2 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x2 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x2 __LF \ + umulh x4, x4, x2 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x2 __LF \ + adc x3, xzr, xzr __LF \ + subs x9, x9, x5 __LF \ + sbcs x10, x10, x4 __LF \ + sbcs x11, x11, x3 __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + sbc x2, x2, xzr __LF \ + lsl x5, x9, #32 __LF \ + add x9, x5, x9 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x9 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x9 __LF \ + umulh x4, x4, x9 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x9 __LF \ + adc x3, xzr, xzr __LF \ + subs x10, x10, x5 __LF \ + sbcs x11, x11, x4 __LF \ + sbcs x12, x12, x3 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x2, x2, xzr __LF \ + sbc x9, x9, xzr __LF \ + lsl x5, x10, #32 __LF \ + add x10, x5, x10 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x10 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x10 __LF \ + umulh x4, x4, x10 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x10 __LF \ + adc x3, xzr, xzr __LF \ + subs x11, x11, x5 __LF \ + sbcs x12, x12, x4 __LF \ + sbcs x13, x13, x3 __LF \ + sbcs x2, x2, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + lsl x5, x11, #32 __LF \ + add x11, x5, x11 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x11 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x11 __LF \ + umulh x4, x4, x11 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x11 __LF \ + adc x3, xzr, xzr __LF \ + subs x12, x12, x5 __LF \ + sbcs x13, x13, x4 __LF \ + sbcs x2, x2, x3 __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbc x11, x11, xzr __LF \ + lsl x5, x12, #32 __LF \ + add x12, x5, x12 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x12 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x12 __LF \ + umulh x4, x4, x12 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x12 __LF \ + adc x3, xzr, xzr __LF \ + subs x13, x13, x5 __LF \ + sbcs x2, x2, x4 __LF \ + sbcs x9, x9, x3 __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbc x12, x12, xzr __LF \ + lsl x5, x13, #32 __LF \ + add x13, x5, x13 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x13 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x13 __LF \ + umulh x4, x4, x13 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x13 __LF \ + adc x3, xzr, xzr __LF \ + subs x2, x2, x5 __LF \ + sbcs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbc x13, x13, xzr __LF \ + adds x2, x2, x14 __LF \ + adcs x9, x9, x15 __LF \ + adcs x10, x10, x16 __LF \ + adcs x11, x11, x17 __LF \ + adcs x12, x12, x19 __LF \ + adcs x13, x13, x20 __LF \ + adc x6, xzr, xzr __LF \ + mov x8, #-4294967295 __LF \ + adds x14, x2, x8 __LF \ + mov x8, #4294967295 __LF \ + adcs x15, x9, x8 __LF \ + mov 
x8, #1 __LF \ + adcs x16, x10, x8 __LF \ + adcs x17, x11, xzr __LF \ + adcs x19, x12, xzr __LF \ + adcs x20, x13, xzr __LF \ + adcs x6, x6, xzr __LF \ + csel x2, x2, x14, eq __LF \ + csel x9, x9, x15, eq __LF \ + csel x10, x10, x16, eq __LF \ + csel x11, x11, x17, eq __LF \ + csel x12, x12, x19, eq __LF \ + csel x13, x13, x20, eq __LF \ + stp x2, x9, [P0] __LF \ + stp x10, x11, [P0+16] __LF \ + stp x12, x13, [P0+32] + +// Almost-Montgomery variant which we use when an input to other muls +// with the other argument fully reduced (which is always safe). In +// fact, with the Karatsuba-based Montgomery mul here, we don't even +// *need* the restriction that the other argument is reduced. + +#define amontsqr_p384(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x8, x2, x4 __LF \ + adds x10, x10, x8 __LF \ + mul x11, x2, x5 __LF \ + mul x8, x3, x4 __LF \ + adcs x11, x11, x8 __LF \ + umulh x12, x2, x5 __LF \ + mul x8, x3, x5 __LF \ + adcs x12, x12, x8 __LF \ + ldp x6, x7, [P1+32] __LF \ + mul x13, x2, x7 __LF \ + mul x8, x3, x6 __LF \ + adcs x13, x13, x8 __LF \ + umulh x14, x2, x7 __LF \ + mul x8, x3, x7 __LF \ + adcs x14, x14, x8 __LF \ + mul x15, x5, x6 __LF \ + adcs x15, x15, xzr __LF \ + umulh x16, x5, x6 __LF \ + adc x16, x16, xzr __LF \ + umulh x8, x2, x4 __LF \ + adds x11, x11, x8 __LF \ + umulh x8, x3, x4 __LF \ + adcs x12, x12, x8 __LF \ + umulh x8, x3, x5 __LF \ + adcs x13, x13, x8 __LF \ + umulh x8, x3, x6 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x3, x7 __LF \ + adcs x15, x15, x8 __LF \ + adc x16, x16, xzr __LF \ + mul x8, x2, x6 __LF \ + adds x12, x12, x8 __LF \ + mul x8, x4, x5 __LF \ + adcs x13, x13, x8 __LF \ + mul x8, x4, x6 __LF \ + adcs x14, x14, x8 __LF \ + mul x8, x4, x7 __LF \ + adcs x15, x15, x8 __LF \ + mul x8, x5, x7 __LF \ + adcs x16, x16, x8 __LF \ + mul x17, x6, x7 __LF \ + adcs x17, x17, xzr __LF \ + umulh x19, x6, x7 __LF \ + adc x19, x19, xzr __LF \ + umulh x8, x2, x6 __LF \ + adds x13, x13, x8 __LF \ + umulh x8, x4, x5 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x4, x6 __LF \ + adcs x15, x15, x8 __LF \ + umulh x8, x4, x7 __LF \ + adcs x16, x16, x8 __LF \ + umulh x8, x5, x7 __LF \ + adcs x17, x17, x8 __LF \ + adc x19, x19, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + adcs x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adcs x17, x17, x17 __LF \ + adcs x19, x19, x19 __LF \ + cset x20, hs __LF \ + umulh x8, x2, x2 __LF \ + mul x2, x2, x2 __LF \ + adds x9, x9, x8 __LF \ + mul x8, x3, x3 __LF \ + adcs x10, x10, x8 __LF \ + umulh x8, x3, x3 __LF \ + adcs x11, x11, x8 __LF \ + mul x8, x4, x4 __LF \ + adcs x12, x12, x8 __LF \ + umulh x8, x4, x4 __LF \ + adcs x13, x13, x8 __LF \ + mul x8, x5, x5 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x5, x5 __LF \ + adcs x15, x15, x8 __LF \ + mul x8, x6, x6 __LF \ + adcs x16, x16, x8 __LF \ + umulh x8, x6, x6 __LF \ + adcs x17, x17, x8 __LF \ + mul x8, x7, x7 __LF \ + adcs x19, x19, x8 __LF \ + umulh x8, x7, x7 __LF \ + adc x20, x20, x8 __LF \ + lsl x5, x2, #32 __LF \ + add x2, x5, x2 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x2 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x2 __LF \ + umulh x4, x4, x2 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x2 __LF \ + adc x3, xzr, xzr __LF \ + subs x9, x9, x5 __LF \ + sbcs x10, x10, x4 __LF \ + sbcs x11, x11, x3 __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, 
xzr __LF \ + sbc x2, x2, xzr __LF \ + lsl x5, x9, #32 __LF \ + add x9, x5, x9 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x9 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x9 __LF \ + umulh x4, x4, x9 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x9 __LF \ + adc x3, xzr, xzr __LF \ + subs x10, x10, x5 __LF \ + sbcs x11, x11, x4 __LF \ + sbcs x12, x12, x3 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x2, x2, xzr __LF \ + sbc x9, x9, xzr __LF \ + lsl x5, x10, #32 __LF \ + add x10, x5, x10 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x10 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x10 __LF \ + umulh x4, x4, x10 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x10 __LF \ + adc x3, xzr, xzr __LF \ + subs x11, x11, x5 __LF \ + sbcs x12, x12, x4 __LF \ + sbcs x13, x13, x3 __LF \ + sbcs x2, x2, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + lsl x5, x11, #32 __LF \ + add x11, x5, x11 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x11 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x11 __LF \ + umulh x4, x4, x11 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x11 __LF \ + adc x3, xzr, xzr __LF \ + subs x12, x12, x5 __LF \ + sbcs x13, x13, x4 __LF \ + sbcs x2, x2, x3 __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbc x11, x11, xzr __LF \ + lsl x5, x12, #32 __LF \ + add x12, x5, x12 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x12 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x12 __LF \ + umulh x4, x4, x12 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x12 __LF \ + adc x3, xzr, xzr __LF \ + subs x13, x13, x5 __LF \ + sbcs x2, x2, x4 __LF \ + sbcs x9, x9, x3 __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbc x12, x12, xzr __LF \ + lsl x5, x13, #32 __LF \ + add x13, x5, x13 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x13 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x13 __LF \ + umulh x4, x4, x13 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x13 __LF \ + adc x3, xzr, xzr __LF \ + subs x2, x2, x5 __LF \ + sbcs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbc x13, x13, xzr __LF \ + adds x2, x2, x14 __LF \ + adcs x9, x9, x15 __LF \ + adcs x10, x10, x16 __LF \ + adcs x11, x11, x17 __LF \ + adcs x12, x12, x19 __LF \ + adcs x13, x13, x20 __LF \ + mov x14, #-4294967295 __LF \ + mov x15, #4294967295 __LF \ + csel x14, x14, xzr, cs __LF \ + csel x15, x15, xzr, cs __LF \ + cset x16, cs __LF \ + adds x2, x2, x14 __LF \ + adcs x9, x9, x15 __LF \ + adcs x10, x10, x16 __LF \ + adcs x11, x11, xzr __LF \ + adcs x12, x12, xzr __LF \ + adc x13, x13, xzr __LF \ + stp x2, x9, [P0] __LF \ + stp x10, x11, [P0+16] __LF \ + stp x12, x13, [P0+32] + +// Corresponds exactly to bignum_sub_p384 + +#define sub_p384(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + ldp x9, x10, [P1+32] __LF \ + ldp x4, x3, [P2+32] __LF \ + sbcs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + csetm x3, lo __LF \ + mov x4, #4294967295 __LF \ + and x4, x4, x3 __LF \ + adds x5, x5, x4 __LF \ + eor x4, x4, x3 __LF \ + adcs x6, x6, x4 __LF \ + mov x4, #-2 __LF \ + and x4, x4, x3 __LF \ + adcs x7, x7, x4 __LF \ + adcs x8, x8, x3 __LF \ + adcs x9, x9, x3 __LF \ + adc x10, x10, x3 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] + +S2N_BN_SYMBOL(p384_montjmixadd_alt): + +// Save regs and make 
room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + amontsqr_p384(zp2,z_1) + montmul_p384(y2a,z_1,y_2) + + montmul_p384(x2a,zp2,x_2) + montmul_p384(y2a,zp2,y2a) + + sub_p384(xd,x2a,x_1) + sub_p384(yd,y2a,y_1) + + amontsqr_p384(zz,xd) + montsqr_p384(ww,yd) + + montmul_p384(zzx1,zz,x_1) + montmul_p384(zzx2,zz,x2a) + + sub_p384(resx,ww,zzx1) + sub_p384(t1,zzx2,zzx1) + + montmul_p384(resz,xd,z_1) + + sub_p384(resx,resx,zzx2) + + sub_p384(t2,zzx1,resx) + + montmul_p384(t1,t1,y_1) + montmul_p384(t2,yd,t2) + + sub_p384(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + ldp x4, x5, [z_1+32] + orr x6, x0, x1 + orr x7, x2, x3 + orr x8, x4, x5 + orr x6, x6, x7 + orr x6, x6, x8 + cmp x6, xzr + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^384 - p_384), +// hence giving 0 + p2 = p2 for the final result. + + ldp x0, x1, [resx] + ldp x19, x20, [x_2] + csel x0, x0, x19, ne + csel x1, x1, x20, ne + ldp x2, x3, [resx+16] + ldp x19, x20, [x_2+16] + csel x2, x2, x19, ne + csel x3, x3, x20, ne + ldp x4, x5, [resx+32] + ldp x19, x20, [x_2+32] + csel x4, x4, x19, ne + csel x5, x5, x20, ne + + ldp x6, x7, [resy] + ldp x19, x20, [y_2] + csel x6, x6, x19, ne + csel x7, x7, x20, ne + ldp x8, x9, [resy+16] + ldp x19, x20, [y_2+16] + csel x8, x8, x19, ne + csel x9, x9, x20, ne + ldp x10, x11, [resy+32] + ldp x19, x20, [y_2+32] + csel x10, x10, x19, ne + csel x11, x11, x20, ne + + ldp x12, x13, [resz] + mov x19, #0xffffffff00000001 + mov x20, #0x00000000ffffffff + csel x12, x12, x19, ne + csel x13, x13, x20, ne + ldp x14, x15, [resz+16] + mov x19, #1 + csel x14, x14, x19, ne + csel x15, x15, xzr, ne + ldp x16, x17, [resz+32] + csel x16, x16, xzr, ne + csel x17, x17, xzr, ne + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [x_3+32] + stp x6, x7, [y_3] + stp x8, x9, [y_3+16] + stp x10, x11, [y_3+32] + stp x12, x13, [z_3] + stp x14, x15, [z_3+16] + stp x16, x17, [z_3+32] + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjscalarmul.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjscalarmul.S new file mode 100644 index 00000000000..4e92ae69a74 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjscalarmul.S @@ -0,0 +1,9988 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery-Jacobian form scalar multiplication for P-384 +// Input scalar[6], point[18]; output res[18] +// +// extern void p384_montjscalarmul +// (uint64_t res[static 18], +// uint64_t scalar[static 6], +// uint64_t point[static 18]); +// +// This function is a variant of its affine point version p384_scalarmul. +// Here, input and output points are assumed to be in Jacobian form with +// their coordinates in the Montgomery domain. Thus, if priming indicates +// Montgomery form, x' = (2^384 * x) mod p_384 etc., each point argument +// is a triple (x',y',z') representing the affine point (x/z^2,y/z^3) when +// z' is nonzero or the point at infinity (group identity) if z' = 0. +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve P-384, returns a representation of n * P. If the result is the +// point at infinity (either because the input point was or because the +// scalar was a multiple of p_384) then the output is guaranteed to +// represent the point at infinity, i.e. to have its z coordinate zero. +// +// Standard ARM ABI: X0 = res, X1 = scalar, X2 = point +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjscalarmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjscalarmul) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 +#define JACSIZE (3*NUMSIZE) + +// Safe copies of input res and additional values in variables. + +#define bf x22 +#define sgn x23 +#define j x24 +#define res x25 + +// Intermediate variables on the stack. +// The table is 16 entries, each of size JACSIZE = 3 * NUMSIZE + +#define scalarb sp, #(0*NUMSIZE) +#define acc sp, #(1*NUMSIZE) +#define tabent sp, #(4*NUMSIZE) + +#define tab sp, #(7*NUMSIZE) + +#define NSPACE #(55*NUMSIZE) + +// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator, +// which doesn't accept repetitions, assembler macros etc. + +#define selectblock(I) \ + cmp bf, #(1*I) __LF \ + ldp x20, x21, [x19] __LF \ + csel x0, x20, x0, eq __LF \ + csel x1, x21, x1, eq __LF \ + ldp x20, x21, [x19, #16] __LF \ + csel x2, x20, x2, eq __LF \ + csel x3, x21, x3, eq __LF \ + ldp x20, x21, [x19, #32] __LF \ + csel x4, x20, x4, eq __LF \ + csel x5, x21, x5, eq __LF \ + ldp x20, x21, [x19, #48] __LF \ + csel x6, x20, x6, eq __LF \ + csel x7, x21, x7, eq __LF \ + ldp x20, x21, [x19, #64] __LF \ + csel x8, x20, x8, eq __LF \ + csel x9, x21, x9, eq __LF \ + ldp x20, x21, [x19, #80] __LF \ + csel x10, x20, x10, eq __LF \ + csel x11, x21, x11, eq __LF \ + ldp x20, x21, [x19, #96] __LF \ + csel x12, x20, x12, eq __LF \ + csel x13, x21, x13, eq __LF \ + ldp x20, x21, [x19, #112] __LF \ + csel x14, x20, x14, eq __LF \ + csel x15, x21, x15, eq __LF \ + ldp x20, x21, [x19, #128] __LF \ + csel x16, x20, x16, eq __LF \ + csel x17, x21, x17, eq __LF \ + add x19, x19, #JACSIZE + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(p384_montjscalarmul): + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x30, [sp, #-16]! + sub sp, sp, NSPACE + +// Preserve the "res" input argument; others get processed early. + + mov res, x0 + +// Reduce the input scalar mod n_384, i.e. 
conditionally subtract n_384. +// Store it to "scalarb". + + ldp x3, x4, [x1] + movbig(x15, #0xecec, #0x196a, #0xccc5, #0x2973) + ldp x5, x6, [x1, #16] + movbig(x16, #0x581a, #0x0db2, #0x48b0, #0xa77a) + ldp x7, x8, [x1, #32] + movbig(x17, #0xc763, #0x4d81, #0xf437, #0x2ddf) + + subs x9, x3, x15 + sbcs x10, x4, x16 + sbcs x11, x5, x17 + adcs x12, x6, xzr + adcs x13, x7, xzr + adcs x14, x8, xzr + + csel x3, x3, x9, cc + csel x4, x4, x10, cc + csel x5, x5, x11, cc + csel x6, x6, x12, cc + csel x7, x7, x13, cc + csel x8, x8, x14, cc + + stp x3, x4, [scalarb] + stp x5, x6, [scalarb+16] + stp x7, x8, [scalarb+32] + +// Set the tab[0] table entry to the input point = 1 * P + + ldp x10, x11, [x2] + stp x10, x11, [tab] + ldp x12, x13, [x2, #16] + stp x12, x13, [tab+16] + ldp x14, x15, [x2, #32] + stp x14, x15, [tab+32] + + ldp x10, x11, [x2, #48] + stp x10, x11, [tab+48] + ldp x12, x13, [x2, #64] + stp x12, x13, [tab+64] + ldp x14, x15, [x2, #80] + stp x14, x15, [tab+80] + + ldp x10, x11, [x2, #96] + stp x10, x11, [tab+96] + ldp x12, x13, [x2, #112] + stp x12, x13, [tab+112] + ldp x14, x15, [x2, #128] + stp x14, x15, [tab+128] + +// Compute and record tab[1] = 2 * p, ..., tab[15] = 16 * P + + add x0, tab+JACSIZE*1 + add x1, tab + bl p384_montjscalarmul_p384_montjdouble + + add x0, tab+JACSIZE*2 + add x1, tab+JACSIZE*1 + add x2, tab + bl p384_montjscalarmul_p384_montjadd + + add x0, tab+JACSIZE*3 + add x1, tab+JACSIZE*1 + bl p384_montjscalarmul_p384_montjdouble + + add x0, tab+JACSIZE*4 + add x1, tab+JACSIZE*3 + add x2, tab + bl p384_montjscalarmul_p384_montjadd + + add x0, tab+JACSIZE*5 + add x1, tab+JACSIZE*2 + bl p384_montjscalarmul_p384_montjdouble + + add x0, tab+JACSIZE*6 + add x1, tab+JACSIZE*5 + add x2, tab + bl p384_montjscalarmul_p384_montjadd + + add x0, tab+JACSIZE*7 + add x1, tab+JACSIZE*3 + bl p384_montjscalarmul_p384_montjdouble + + add x0, tab+JACSIZE*8 + add x1, tab+JACSIZE*7 + add x2, tab + bl p384_montjscalarmul_p384_montjadd + + add x0, tab+JACSIZE*9 + add x1, tab+JACSIZE*4 + bl p384_montjscalarmul_p384_montjdouble + + add x0, tab+JACSIZE*10 + add x1, tab+JACSIZE*9 + add x2, tab + bl p384_montjscalarmul_p384_montjadd + + add x0, tab+JACSIZE*11 + add x1, tab+JACSIZE*5 + bl p384_montjscalarmul_p384_montjdouble + + add x0, tab+JACSIZE*12 + add x1, tab+JACSIZE*11 + add x2, tab + bl p384_montjscalarmul_p384_montjadd + + add x0, tab+JACSIZE*13 + add x1, tab+JACSIZE*6 + bl p384_montjscalarmul_p384_montjdouble + + add x0, tab+JACSIZE*14 + add x1, tab+JACSIZE*13 + add x2, tab + bl p384_montjscalarmul_p384_montjadd + + add x0, tab+JACSIZE*15 + add x1, tab+JACSIZE*7 + bl p384_montjscalarmul_p384_montjdouble + +// Add the recoding constant sum_i(16 * 32^i) to the scalar to allow signed +// digits. The digits of the constant, in lowest-to-highest order, are as +// follows; they are generated dynamically since none is a simple ARM load. +// +// 0x0842108421084210 +// 0x1084210842108421 +// 0x2108421084210842 +// 0x4210842108421084 +// 0x8421084210842108 +// 0x0842108421084210 + + ldp x0, x1, [scalarb] + ldp x2, x3, [scalarb+16] + ldp x4, x5, [scalarb+32] + movbig(x8, #0x1084, #0x2108, #0x4210, #0x8421) + adds x0, x0, x8, lsr #1 + adcs x1, x1, x8 + lsl x8, x8, #1 + adcs x2, x2, x8 + lsl x8, x8, #1 + adcs x3, x3, x8 + lsl x8, x8, #1 + adcs x4, x4, x8 + lsr x8, x8, #4 + adcs x5, x5, x8 + cset x6, cs + +// Record the top bitfield then shift the whole scalar left 4 bits +// to align the top of the next bitfield with the MSB (bits 379..383). 
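+// For example, with this recoding each 5-bit window value w in 0..31
+// stands for the signed digit w - 16 in -16..15 (so w = 3 encodes -13);
+// the main loop below recovers the sign and magnitude of each such digit
+// with "subs bf, bf, #16" / "cneg" before the constant-time table lookup.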
+ + extr bf, x6, x5, #60 + extr x5, x5, x4, #60 + extr x4, x4, x3, #60 + extr x3, x3, x2, #60 + extr x2, x2, x1, #60 + extr x1, x1, x0, #60 + lsl x0, x0, #4 + stp x0, x1, [scalarb] + stp x2, x3, [scalarb+16] + stp x4, x5, [scalarb+32] + +// Initialize the accumulator to the corresponding entry using constant-time +// lookup in the table. This top digit, uniquely, is not recoded so there is +// no sign adjustment to make. + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + mov x12, xzr + mov x13, xzr + mov x14, xzr + mov x15, xzr + mov x16, xzr + mov x17, xzr + + add x19, tab + + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + selectblock(9) + selectblock(10) + selectblock(11) + selectblock(12) + selectblock(13) + selectblock(14) + selectblock(15) + selectblock(16) + + stp x0, x1, [acc] + stp x2, x3, [acc+16] + stp x4, x5, [acc+32] + stp x6, x7, [acc+48] + stp x8, x9, [acc+64] + stp x10, x11, [acc+80] + stp x12, x13, [acc+96] + stp x14, x15, [acc+112] + stp x16, x17, [acc+128] + + mov j, #380 + +// Main loop over size-5 bitfields: double 5 times then add signed digit +// At each stage we shift the scalar left by 5 bits so we can simply pick +// the top 5 bits as the bitfield, saving some fiddle over indexing. + +p384_montjscalarmul_mainloop: + sub j, j, #5 + + add x0, acc + add x1, acc + bl p384_montjscalarmul_p384_montjdouble + + add x0, acc + add x1, acc + bl p384_montjscalarmul_p384_montjdouble + + add x0, acc + add x1, acc + bl p384_montjscalarmul_p384_montjdouble + + add x0, acc + add x1, acc + bl p384_montjscalarmul_p384_montjdouble + + add x0, acc + add x1, acc + bl p384_montjscalarmul_p384_montjdouble + +// Choose the bitfield and adjust it to sign and magnitude + + ldp x0, x1, [scalarb] + ldp x2, x3, [scalarb+16] + ldp x4, x5, [scalarb+32] + lsr bf, x5, #59 + extr x5, x5, x4, #59 + extr x4, x4, x3, #59 + extr x3, x3, x2, #59 + extr x2, x2, x1, #59 + extr x1, x1, x0, #59 + lsl x0, x0, #5 + stp x0, x1, [scalarb] + stp x2, x3, [scalarb+16] + stp x4, x5, [scalarb+32] + + subs bf, bf, #16 + cset sgn, lo // sgn = sign of digit (1 = negative) + cneg bf, bf, lo // bf = absolute value of digit + +// Conditionally select the table entry tab[i-1] = i * P in constant time + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + mov x12, xzr + mov x13, xzr + mov x14, xzr + mov x15, xzr + mov x16, xzr + mov x17, xzr + + add x19, tab + + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + selectblock(9) + selectblock(10) + selectblock(11) + selectblock(12) + selectblock(13) + selectblock(14) + selectblock(15) + selectblock(16) + +// Store it to "tabent" with the y coordinate optionally negated. +// This is done carefully to give coordinates < p_384 even in +// the degenerate case y = 0 (when z = 0 for points on the curve). 
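+// Concretely, the code below forms p_384 - y limb by limb and then uses
+// cmp/ccmp so that this negation is kept only when the digit is negative
+// (sgn set) and y is nonzero (tested via an OR of all its limbs); when
+// y = 0, p_384 - 0 = p_384 would not be a reduced coordinate, so the
+// original zero limbs are stored instead.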
+
+        stp x0, x1, [tabent]
+        stp x2, x3, [tabent+16]
+        stp x4, x5, [tabent+32]
+
+        stp x12, x13, [tabent+96]
+        stp x14, x15, [tabent+112]
+        stp x16, x17, [tabent+128]
+
+        mov x0, #0x00000000ffffffff
+        subs x0, x0, x6
+        orr x12, x6, x7
+        mov x1, #0xffffffff00000000
+        sbcs x1, x1, x7
+        orr x13, x8, x9
+        mov x2, #0xfffffffffffffffe
+        sbcs x2, x2, x8
+        orr x14, x10, x11
+        mov x5, #0xffffffffffffffff
+        sbcs x3, x5, x9
+        orr x12, x12, x13
+        sbcs x4, x5, x10
+        orr x12, x12, x14
+        sbcs x5, x5, x11
+
+        cmp sgn, xzr
+        ccmp x12, xzr, #4, ne
+
+        csel x6, x0, x6, ne
+        csel x7, x1, x7, ne
+        csel x8, x2, x8, ne
+        csel x9, x3, x9, ne
+        csel x10, x4, x10, ne
+        csel x11, x5, x11, ne
+
+        stp x6, x7, [tabent+48]
+        stp x8, x9, [tabent+64]
+        stp x10, x11, [tabent+80]
+
+// Add to the accumulator
+
+        add x0, acc
+        add x1, acc
+        add x2, tabent
+        bl p384_montjscalarmul_p384_montjadd
+
+        cbnz j, p384_montjscalarmul_mainloop
+
+// That's the end of the main loop, and we just need to copy the
+// result in "acc" to the output.
+
+        ldp x0, x1, [acc]
+        stp x0, x1, [res]
+        ldp x0, x1, [acc+16]
+        stp x0, x1, [res, #16]
+        ldp x0, x1, [acc+32]
+        stp x0, x1, [res, #32]
+        ldp x0, x1, [acc+48]
+        stp x0, x1, [res, #48]
+        ldp x0, x1, [acc+64]
+        stp x0, x1, [res, #64]
+        ldp x0, x1, [acc+80]
+        stp x0, x1, [res, #80]
+        ldp x0, x1, [acc+96]
+        stp x0, x1, [res, #96]
+        ldp x0, x1, [acc+112]
+        stp x0, x1, [res, #112]
+        ldp x0, x1, [acc+128]
+        stp x0, x1, [res, #128]
+
+// Restore stack and registers and return
+
+        add sp, sp, NSPACE
+        ldp x25, x30, [sp], 16
+        ldp x23, x24, [sp], 16
+        ldp x21, x22, [sp], 16
+        ldp x19, x20, [sp], 16
+        ret
+
+// Local copies of subroutines, complete clones at the moment
+
+p384_montjscalarmul_p384_montjadd:
+        stp x19, x20, [sp, #-16]!
+        stp x21, x22, [sp, #-16]!
+        stp x23, x24, [sp, #-16]!
+        stp x25, x26, [sp, #-16]!
+        stp x27, xzr, [sp, #-16]!
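The block above builds p_384 - y limb by limb and keeps it only when the digit was negative and y is nonzero (the orr chain into x12 feeding the ccmp). A small C model of that conditional negation, under the assumption of six little-endian 64-bit limbs; ct_cond_negate_y is an illustrative name, not something the patch defines:

/* Toy C model of conditional point negation mod p_384. Not part of the patch. */
#include <stdint.h>
#include <stdio.h>

static const uint64_t P384[6] = {
    0x00000000ffffffffULL, 0xffffffff00000000ULL, 0xfffffffffffffffeULL,
    0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL};

/* y := (negate && y != 0) ? p_384 - y : y, branch-free; negate is 0 or 1. */
static void ct_cond_negate_y(uint64_t y[6], uint64_t negate) {
  uint64_t neg[6], nonzero = 0, borrow = 0;
  for (int i = 0; i < 6; i++) {
    nonzero |= y[i];
    uint64_t d = P384[i] - y[i];              /* limb-wise p_384 - y ...     */
    uint64_t b = (uint64_t)(P384[i] < y[i]);  /* ... with borrow chaining    */
    neg[i] = d - borrow;
    borrow = b | (uint64_t)(d < borrow);
  }
  uint64_t nz = (nonzero | (0 - nonzero)) >> 63;  /* 1 iff y != 0            */
  uint64_t mask = 0 - (negate & nz);              /* all-ones iff negating   */
  for (int i = 0; i < 6; i++) {
    y[i] = (y[i] & ~mask) | (neg[i] & mask);      /* select y or p_384 - y   */
  }
}

int main(void) {
  uint64_t y[6] = {5, 0, 0, 0, 0, 0};
  ct_cond_negate_y(y, 1);                  /* y becomes p_384 - 5 */
  printf("low limb after negation = %#llx\n", (unsigned long long)y[0]);
  return 0;
}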
+ sub sp, sp, #0x180 + mov x24, x0 + mov x25, x1 + mov x26, x2 + mov x0, sp + ldr q1, [x25, #96] + ldp x9, x2, [x25, #96] + ldr q0, [x25, #96] + ldp x4, x6, [x25, #112] + rev64 v21.4s, v1.4s + uzp2 v28.4s, v1.4s, v1.4s + umulh x7, x9, x2 + xtn v17.2s, v1.2d + mul v27.4s, v21.4s, v0.4s + ldr q20, [x25, #128] + xtn v30.2s, v0.2d + ldr q1, [x25, #128] + uzp2 v31.4s, v0.4s, v0.4s + ldp x5, x10, [x25, #128] + umulh x8, x9, x4 + uaddlp v3.2d, v27.4s + umull v16.2d, v30.2s, v17.2s + mul x16, x9, x4 + umull v27.2d, v30.2s, v28.2s + shrn v0.2s, v20.2d, #32 + xtn v7.2s, v20.2d + shl v20.2d, v3.2d, #32 + umull v3.2d, v31.2s, v28.2s + mul x3, x2, x4 + umlal v20.2d, v30.2s, v17.2s + umull v22.2d, v7.2s, v0.2s + usra v27.2d, v16.2d, #32 + umulh x11, x2, x4 + movi v21.2d, #0xffffffff + uzp2 v28.4s, v1.4s, v1.4s + adds x15, x16, x7 + and v5.16b, v27.16b, v21.16b + adcs x3, x3, x8 + usra v3.2d, v27.2d, #32 + dup v29.2d, x6 + adcs x16, x11, xzr + mov x14, v20.d[0] + umlal v5.2d, v31.2s, v17.2s + mul x8, x9, x2 + mov x7, v20.d[1] + shl v19.2d, v22.2d, #33 + xtn v25.2s, v29.2d + rev64 v31.4s, v1.4s + lsl x13, x14, #32 + uzp2 v6.4s, v29.4s, v29.4s + umlal v19.2d, v7.2s, v7.2s + usra v3.2d, v5.2d, #32 + adds x1, x8, x8 + umulh x8, x4, x4 + add x12, x13, x14 + mul v17.4s, v31.4s, v29.4s + xtn v4.2s, v1.2d + adcs x14, x15, x15 + lsr x13, x12, #32 + adcs x15, x3, x3 + umull v31.2d, v25.2s, v28.2s + adcs x11, x16, x16 + umull v21.2d, v25.2s, v4.2s + mov x17, v3.d[0] + umull v18.2d, v6.2s, v28.2s + adc x16, x8, xzr + uaddlp v16.2d, v17.4s + movi v1.2d, #0xffffffff + subs x13, x13, x12 + usra v31.2d, v21.2d, #32 + sbc x8, x12, xzr + adds x17, x17, x1 + mul x1, x4, x4 + shl v28.2d, v16.2d, #32 + mov x3, v3.d[1] + adcs x14, x7, x14 + extr x7, x8, x13, #32 + adcs x13, x3, x15 + and v3.16b, v31.16b, v1.16b + adcs x11, x1, x11 + lsr x1, x8, #32 + umlal v3.2d, v6.2s, v4.2s + usra v18.2d, v31.2d, #32 + adc x3, x16, xzr + adds x1, x1, x12 + umlal v28.2d, v25.2s, v4.2s + adc x16, xzr, xzr + subs x15, x17, x7 + sbcs x7, x14, x1 + lsl x1, x15, #32 + sbcs x16, x13, x16 + add x8, x1, x15 + usra v18.2d, v3.2d, #32 + sbcs x14, x11, xzr + lsr x1, x8, #32 + sbcs x17, x3, xzr + sbc x11, x12, xzr + subs x13, x1, x8 + umulh x12, x4, x10 + sbc x1, x8, xzr + extr x13, x1, x13, #32 + lsr x1, x1, #32 + adds x15, x1, x8 + adc x1, xzr, xzr + subs x7, x7, x13 + sbcs x13, x16, x15 + lsl x3, x7, #32 + umulh x16, x2, x5 + sbcs x15, x14, x1 + add x7, x3, x7 + sbcs x3, x17, xzr + lsr x1, x7, #32 + sbcs x14, x11, xzr + sbc x11, x8, xzr + subs x8, x1, x7 + sbc x1, x7, xzr + extr x8, x1, x8, #32 + lsr x1, x1, #32 + adds x1, x1, x7 + adc x17, xzr, xzr + subs x13, x13, x8 + umulh x8, x9, x6 + sbcs x1, x15, x1 + sbcs x15, x3, x17 + sbcs x3, x14, xzr + mul x17, x2, x5 + sbcs x11, x11, xzr + stp x13, x1, [x0] + sbc x14, x7, xzr + mul x7, x4, x10 + subs x1, x9, x2 + stp x15, x3, [x0, #16] + csetm x15, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + stp x11, x14, [x0, #32] + mul x14, x9, x6 + adds x17, x8, x17 + adcs x7, x16, x7 + adc x13, x12, xzr + subs x12, x5, x6 + cneg x3, x12, cc // cc = lo, ul, last + cinv x16, x15, cc // cc = lo, ul, last + mul x8, x1, x3 + umulh x1, x1, x3 + eor x12, x8, x16 + adds x11, x17, x14 + adcs x3, x7, x17 + adcs x15, x13, x7 + adc x8, x13, xzr + adds x3, x3, x14 + adcs x15, x15, x17 + adcs x17, x8, x7 + eor x1, x1, x16 + adc x13, x13, xzr + subs x9, x9, x4 + csetm x8, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x4, x2, x4 + cneg x4, x4, cc // cc = lo, ul, last + csetm x7, cc // cc 
= lo, ul, last + subs x2, x10, x6 + cinv x8, x8, cc // cc = lo, ul, last + cneg x2, x2, cc // cc = lo, ul, last + cmn x16, #0x1 + adcs x11, x11, x12 + mul x12, x9, x2 + adcs x3, x3, x1 + adcs x15, x15, x16 + umulh x9, x9, x2 + adcs x17, x17, x16 + adc x13, x13, x16 + subs x1, x10, x5 + cinv x2, x7, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + eor x9, x9, x8 + cmn x8, #0x1 + eor x7, x12, x8 + mul x12, x4, x1 + adcs x3, x3, x7 + adcs x7, x15, x9 + adcs x15, x17, x8 + ldp x9, x17, [x0, #16] + umulh x4, x4, x1 + adc x8, x13, x8 + cmn x2, #0x1 + eor x1, x12, x2 + adcs x1, x7, x1 + ldp x7, x16, [x0] + eor x12, x4, x2 + adcs x4, x15, x12 + ldp x15, x12, [x0, #32] + adc x8, x8, x2 + adds x13, x14, x14 + umulh x14, x5, x10 + adcs x2, x11, x11 + adcs x3, x3, x3 + adcs x1, x1, x1 + adcs x4, x4, x4 + adcs x11, x8, x8 + adc x8, xzr, xzr + adds x13, x13, x7 + adcs x2, x2, x16 + mul x16, x5, x10 + adcs x3, x3, x9 + adcs x1, x1, x17 + umulh x5, x5, x5 + lsl x9, x13, #32 + add x9, x9, x13 + adcs x4, x4, x15 + mov x13, v28.d[1] + adcs x15, x11, x12 + lsr x7, x9, #32 + adc x11, x8, xzr + subs x7, x7, x9 + umulh x10, x10, x10 + sbc x17, x9, xzr + extr x7, x17, x7, #32 + lsr x17, x17, #32 + adds x17, x17, x9 + adc x12, xzr, xzr + subs x8, x2, x7 + sbcs x17, x3, x17 + lsl x7, x8, #32 + sbcs x2, x1, x12 + add x3, x7, x8 + sbcs x12, x4, xzr + lsr x1, x3, #32 + sbcs x7, x15, xzr + sbc x15, x9, xzr + subs x1, x1, x3 + sbc x4, x3, xzr + lsr x9, x4, #32 + extr x8, x4, x1, #32 + adds x9, x9, x3 + adc x4, xzr, xzr + subs x1, x17, x8 + lsl x17, x1, #32 + sbcs x8, x2, x9 + sbcs x9, x12, x4 + add x17, x17, x1 + mov x1, v18.d[1] + lsr x2, x17, #32 + sbcs x7, x7, xzr + mov x12, v18.d[0] + sbcs x15, x15, xzr + sbc x3, x3, xzr + subs x4, x2, x17 + sbc x2, x17, xzr + adds x12, x13, x12 + adcs x16, x16, x1 + lsr x13, x2, #32 + extr x1, x2, x4, #32 + adc x2, x14, xzr + adds x4, x13, x17 + mul x13, x6, x6 + adc x14, xzr, xzr + subs x1, x8, x1 + sbcs x4, x9, x4 + mov x9, v28.d[0] + sbcs x7, x7, x14 + sbcs x8, x15, xzr + sbcs x3, x3, xzr + sbc x14, x17, xzr + adds x17, x9, x9 + adcs x12, x12, x12 + mov x15, v19.d[0] + adcs x9, x16, x16 + umulh x6, x6, x6 + adcs x16, x2, x2 + adc x2, xzr, xzr + adds x11, x11, x8 + adcs x3, x3, xzr + adcs x14, x14, xzr + adcs x8, xzr, xzr + adds x13, x1, x13 + mov x1, v19.d[1] + adcs x6, x4, x6 + mov x4, #0xffffffff // #4294967295 + adcs x15, x7, x15 + adcs x7, x11, x5 + adcs x1, x3, x1 + adcs x14, x14, x10 + adc x11, x8, xzr + adds x6, x6, x17 + adcs x8, x15, x12 + adcs x3, x7, x9 + adcs x15, x1, x16 + mov x16, #0xffffffff00000001 // #-4294967295 + adcs x14, x14, x2 + mov x2, #0x1 // #1 + adc x17, x11, xzr + cmn x13, x16 + adcs xzr, x6, x4 + adcs xzr, x8, x2 + adcs xzr, x3, xzr + adcs xzr, x15, xzr + adcs xzr, x14, xzr + adc x1, x17, xzr + neg x9, x1 + and x1, x16, x9 + adds x11, x13, x1 + and x13, x4, x9 + adcs x5, x6, x13 + and x1, x2, x9 + adcs x7, x8, x1 + stp x11, x5, [x0] + adcs x11, x3, xzr + adcs x2, x15, xzr + stp x7, x11, [x0, #16] + adc x17, x14, xzr + stp x2, x17, [x0, #32] + ldr q1, [x26, #96] + ldp x9, x2, [x26, #96] + ldr q0, [x26, #96] + ldp x4, x6, [x26, #112] + rev64 v21.4s, v1.4s + uzp2 v28.4s, v1.4s, v1.4s + umulh x7, x9, x2 + xtn v17.2s, v1.2d + mul v27.4s, v21.4s, v0.4s + ldr q20, [x26, #128] + xtn v30.2s, v0.2d + ldr q1, [x26, #128] + uzp2 v31.4s, v0.4s, v0.4s + ldp x5, x10, [x26, #128] + umulh x8, x9, x4 + uaddlp v3.2d, v27.4s + umull v16.2d, v30.2s, v17.2s + mul x16, x9, x4 + umull v27.2d, v30.2s, v28.2s + shrn v0.2s, v20.2d, #32 + xtn v7.2s, v20.2d + shl 
v20.2d, v3.2d, #32 + umull v3.2d, v31.2s, v28.2s + mul x3, x2, x4 + umlal v20.2d, v30.2s, v17.2s + umull v22.2d, v7.2s, v0.2s + usra v27.2d, v16.2d, #32 + umulh x11, x2, x4 + movi v21.2d, #0xffffffff + uzp2 v28.4s, v1.4s, v1.4s + adds x15, x16, x7 + and v5.16b, v27.16b, v21.16b + adcs x3, x3, x8 + usra v3.2d, v27.2d, #32 + dup v29.2d, x6 + adcs x16, x11, xzr + mov x14, v20.d[0] + umlal v5.2d, v31.2s, v17.2s + mul x8, x9, x2 + mov x7, v20.d[1] + shl v19.2d, v22.2d, #33 + xtn v25.2s, v29.2d + rev64 v31.4s, v1.4s + lsl x13, x14, #32 + uzp2 v6.4s, v29.4s, v29.4s + umlal v19.2d, v7.2s, v7.2s + usra v3.2d, v5.2d, #32 + adds x1, x8, x8 + umulh x8, x4, x4 + add x12, x13, x14 + mul v17.4s, v31.4s, v29.4s + xtn v4.2s, v1.2d + adcs x14, x15, x15 + lsr x13, x12, #32 + adcs x15, x3, x3 + umull v31.2d, v25.2s, v28.2s + adcs x11, x16, x16 + umull v21.2d, v25.2s, v4.2s + mov x17, v3.d[0] + umull v18.2d, v6.2s, v28.2s + adc x16, x8, xzr + uaddlp v16.2d, v17.4s + movi v1.2d, #0xffffffff + subs x13, x13, x12 + usra v31.2d, v21.2d, #32 + sbc x8, x12, xzr + adds x17, x17, x1 + mul x1, x4, x4 + shl v28.2d, v16.2d, #32 + mov x3, v3.d[1] + adcs x14, x7, x14 + extr x7, x8, x13, #32 + adcs x13, x3, x15 + and v3.16b, v31.16b, v1.16b + adcs x11, x1, x11 + lsr x1, x8, #32 + umlal v3.2d, v6.2s, v4.2s + usra v18.2d, v31.2d, #32 + adc x3, x16, xzr + adds x1, x1, x12 + umlal v28.2d, v25.2s, v4.2s + adc x16, xzr, xzr + subs x15, x17, x7 + sbcs x7, x14, x1 + lsl x1, x15, #32 + sbcs x16, x13, x16 + add x8, x1, x15 + usra v18.2d, v3.2d, #32 + sbcs x14, x11, xzr + lsr x1, x8, #32 + sbcs x17, x3, xzr + sbc x11, x12, xzr + subs x13, x1, x8 + umulh x12, x4, x10 + sbc x1, x8, xzr + extr x13, x1, x13, #32 + lsr x1, x1, #32 + adds x15, x1, x8 + adc x1, xzr, xzr + subs x7, x7, x13 + sbcs x13, x16, x15 + lsl x3, x7, #32 + umulh x16, x2, x5 + sbcs x15, x14, x1 + add x7, x3, x7 + sbcs x3, x17, xzr + lsr x1, x7, #32 + sbcs x14, x11, xzr + sbc x11, x8, xzr + subs x8, x1, x7 + sbc x1, x7, xzr + extr x8, x1, x8, #32 + lsr x1, x1, #32 + adds x1, x1, x7 + adc x17, xzr, xzr + subs x13, x13, x8 + umulh x8, x9, x6 + sbcs x1, x15, x1 + sbcs x15, x3, x17 + sbcs x3, x14, xzr + mul x17, x2, x5 + sbcs x11, x11, xzr + stp x13, x1, [sp, #240] + sbc x14, x7, xzr + mul x7, x4, x10 + subs x1, x9, x2 + stp x15, x3, [sp, #256] + csetm x15, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + stp x11, x14, [sp, #272] + mul x14, x9, x6 + adds x17, x8, x17 + adcs x7, x16, x7 + adc x13, x12, xzr + subs x12, x5, x6 + cneg x3, x12, cc // cc = lo, ul, last + cinv x16, x15, cc // cc = lo, ul, last + mul x8, x1, x3 + umulh x1, x1, x3 + eor x12, x8, x16 + adds x11, x17, x14 + adcs x3, x7, x17 + adcs x15, x13, x7 + adc x8, x13, xzr + adds x3, x3, x14 + adcs x15, x15, x17 + adcs x17, x8, x7 + eor x1, x1, x16 + adc x13, x13, xzr + subs x9, x9, x4 + csetm x8, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x4, x2, x4 + cneg x4, x4, cc // cc = lo, ul, last + csetm x7, cc // cc = lo, ul, last + subs x2, x10, x6 + cinv x8, x8, cc // cc = lo, ul, last + cneg x2, x2, cc // cc = lo, ul, last + cmn x16, #0x1 + adcs x11, x11, x12 + mul x12, x9, x2 + adcs x3, x3, x1 + adcs x15, x15, x16 + umulh x9, x9, x2 + adcs x17, x17, x16 + adc x13, x13, x16 + subs x1, x10, x5 + cinv x2, x7, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + eor x9, x9, x8 + cmn x8, #0x1 + eor x7, x12, x8 + mul x12, x4, x1 + adcs x3, x3, x7 + adcs x7, x15, x9 + adcs x15, x17, x8 + ldp x9, x17, [sp, #256] + umulh x4, x4, x1 + adc x8, x13, x8 + cmn x2, #0x1 + eor 
x1, x12, x2 + adcs x1, x7, x1 + ldp x7, x16, [sp, #240] + eor x12, x4, x2 + adcs x4, x15, x12 + ldp x15, x12, [sp, #272] + adc x8, x8, x2 + adds x13, x14, x14 + umulh x14, x5, x10 + adcs x2, x11, x11 + adcs x3, x3, x3 + adcs x1, x1, x1 + adcs x4, x4, x4 + adcs x11, x8, x8 + adc x8, xzr, xzr + adds x13, x13, x7 + adcs x2, x2, x16 + mul x16, x5, x10 + adcs x3, x3, x9 + adcs x1, x1, x17 + umulh x5, x5, x5 + lsl x9, x13, #32 + add x9, x9, x13 + adcs x4, x4, x15 + mov x13, v28.d[1] + adcs x15, x11, x12 + lsr x7, x9, #32 + adc x11, x8, xzr + subs x7, x7, x9 + umulh x10, x10, x10 + sbc x17, x9, xzr + extr x7, x17, x7, #32 + lsr x17, x17, #32 + adds x17, x17, x9 + adc x12, xzr, xzr + subs x8, x2, x7 + sbcs x17, x3, x17 + lsl x7, x8, #32 + sbcs x2, x1, x12 + add x3, x7, x8 + sbcs x12, x4, xzr + lsr x1, x3, #32 + sbcs x7, x15, xzr + sbc x15, x9, xzr + subs x1, x1, x3 + sbc x4, x3, xzr + lsr x9, x4, #32 + extr x8, x4, x1, #32 + adds x9, x9, x3 + adc x4, xzr, xzr + subs x1, x17, x8 + lsl x17, x1, #32 + sbcs x8, x2, x9 + sbcs x9, x12, x4 + add x17, x17, x1 + mov x1, v18.d[1] + lsr x2, x17, #32 + sbcs x7, x7, xzr + mov x12, v18.d[0] + sbcs x15, x15, xzr + sbc x3, x3, xzr + subs x4, x2, x17 + sbc x2, x17, xzr + adds x12, x13, x12 + adcs x16, x16, x1 + lsr x13, x2, #32 + extr x1, x2, x4, #32 + adc x2, x14, xzr + adds x4, x13, x17 + mul x13, x6, x6 + adc x14, xzr, xzr + subs x1, x8, x1 + sbcs x4, x9, x4 + mov x9, v28.d[0] + sbcs x7, x7, x14 + sbcs x8, x15, xzr + sbcs x3, x3, xzr + sbc x14, x17, xzr + adds x17, x9, x9 + adcs x12, x12, x12 + mov x15, v19.d[0] + adcs x9, x16, x16 + umulh x6, x6, x6 + adcs x16, x2, x2 + adc x2, xzr, xzr + adds x11, x11, x8 + adcs x3, x3, xzr + adcs x14, x14, xzr + adcs x8, xzr, xzr + adds x13, x1, x13 + mov x1, v19.d[1] + adcs x6, x4, x6 + mov x4, #0xffffffff // #4294967295 + adcs x15, x7, x15 + adcs x7, x11, x5 + adcs x1, x3, x1 + adcs x14, x14, x10 + adc x11, x8, xzr + adds x6, x6, x17 + adcs x8, x15, x12 + adcs x3, x7, x9 + adcs x15, x1, x16 + mov x16, #0xffffffff00000001 // #-4294967295 + adcs x14, x14, x2 + mov x2, #0x1 // #1 + adc x17, x11, xzr + cmn x13, x16 + adcs xzr, x6, x4 + adcs xzr, x8, x2 + adcs xzr, x3, xzr + adcs xzr, x15, xzr + adcs xzr, x14, xzr + adc x1, x17, xzr + neg x9, x1 + and x1, x16, x9 + adds x11, x13, x1 + and x13, x4, x9 + adcs x5, x6, x13 + and x1, x2, x9 + adcs x7, x8, x1 + stp x11, x5, [sp, #240] + adcs x11, x3, xzr + adcs x2, x15, xzr + stp x7, x11, [sp, #256] + adc x17, x14, xzr + stp x2, x17, [sp, #272] + stp x23, x24, [sp, #0x150] + ldr q3, [x26, #96] + ldr q25, [x25, #48] + ldp x13, x23, [x25, #48] + ldp x3, x21, [x26, #96] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [x25, #80] + ldp x8, x24, [x26, #112] + subs x6, x3, x21 + ldr q0, [x26, #128] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [x25, #64] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, 
x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [x25, #80] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [x26, #128] + lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [sp, #288] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #304] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #320] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, 
x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [sp, #288] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #304] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #320] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [sp, #288] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [sp, #304] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #320] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #288] + ldp x21, x12, [sp, #304] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #320] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr 
x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x14, x24, x11 + stp x22, x5, [sp, #288] + adcs x5, x13, x23 + adcs x21, x8, x23 + stp x14, x5, [sp, #304] + adc x12, x15, x23 + stp x21, x12, [sp, #320] + ldr q3, [x25, #96] + ldr q25, [x26, #48] + ldp x13, x23, [x26, #48] + ldp x3, x21, [x25, #96] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [x26, #80] + ldp x8, x24, [x25, #112] + subs x6, x3, x21 + ldr q0, [x25, #128] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [x26, #64] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [x26, #80] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [x25, #128] + lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + 
sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [sp, #48] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #64] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #80] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [sp, #48] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #64] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #80] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [sp, #48] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [sp, #64] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #80] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last 
+ adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #48] + ldp x21, x12, [sp, #64] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #80] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x14, x24, x11 + stp x22, x5, [sp, #48] + adcs x5, x13, x23 + adcs x21, x8, x23 + stp x14, x5, [sp, #64] + adc x12, x15, x23 + stp x21, x12, [sp, #80] + mov x1, sp + ldr q3, [x1] + ldr q25, [x26] + ldp x13, x23, [x26] + ldp x3, x21, [x1] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [x26, #32] + ldp x8, x24, [x1, #16] + subs x6, x3, x21 + ldr q0, [x1, #32] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [x26, #16] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, 
v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [x26, #32] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [x1, #32] + lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [sp, #96] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #112] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #128] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc 
= lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [sp, #96] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #112] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #128] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [sp, #96] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [sp, #112] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #128] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #96] + ldp x21, x12, [sp, #112] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #128] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, 
x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x14, x24, x11 + stp x22, x5, [sp, #96] + adcs x5, x13, x23 + adcs x21, x8, x23 + stp x14, x5, [sp, #112] + adc x12, x15, x23 + stp x21, x12, [sp, #128] + ldr q3, [sp, #240] + ldr q25, [x25] + ldp x13, x23, [x25] + ldp x3, x21, [sp, #240] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [x25, #32] + ldp x8, x24, [sp, #256] + subs x6, x3, x21 + ldr q0, [sp, #272] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [x25, #16] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [x25, #32] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [sp, #272] + lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, 
x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [sp, #192] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #208] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #224] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [sp, #192] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #208] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #224] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [sp, #192] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [sp, #208] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #224] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv 
x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #192] + ldp x21, x12, [sp, #208] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #224] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x14, x24, x11 + stp x22, x5, [sp, #192] + adcs x5, x13, x23 + adcs x21, x8, x23 + stp x14, x5, [sp, #208] + adc x12, x15, x23 + stp x21, x12, [sp, #224] + mov x1, sp + ldr q3, [x1] + ldr q25, [sp, #48] + ldp x13, x23, [sp, #48] + ldp x3, x21, [x1] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [sp, #80] + ldp x8, x24, [x1, #16] + subs x6, x3, x21 + ldr q0, [x1, #32] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [sp, #64] + mul 
v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [sp, #80] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [x1, #32] + lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [sp, #48] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #64] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #80] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc 
= lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [sp, #48] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #64] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #80] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [sp, #48] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [sp, #64] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #80] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #48] + ldp x21, x12, [sp, #64] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #80] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs 
x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x14, x24, x11 + stp x22, x5, [sp, #48] + adcs x5, x13, x23 + adcs x21, x8, x23 + stp x14, x5, [sp, #64] + adc x12, x15, x23 + stp x21, x12, [sp, #80] + ldr q3, [sp, #240] + ldr q25, [sp, #288] + ldp x13, x23, [sp, #288] + ldp x3, x21, [sp, #240] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [sp, #320] + ldp x8, x24, [sp, #256] + subs x6, x3, x21 + ldr q0, [sp, #272] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [sp, #304] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [sp, #320] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [sp, #272] + 
lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [sp, #288] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #304] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #320] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [sp, #288] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #304] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #320] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [sp, #288] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [sp, #304] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #320] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = 
lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #288] + ldp x21, x12, [sp, #304] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #320] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x2, x24, x11 + stp x22, x5, [sp, #288] + adcs x11, x13, x23 + adcs x12, x8, x23 + stp x2, x11, [sp, #304] + adc x13, x15, x23 + stp x12, x13, [sp, #320] + ldp x5, x6, [sp, #96] + ldp x4, x3, [sp, #192] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #112] + ldp x4, x3, [sp, #208] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + ldp x9, x10, [sp, #128] + ldp x4, x3, [sp, #224] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + csetm x3, cc // cc = lo, ul, last + mov x4, #0xffffffff // #4294967295 + and x4, x4, x3 + adds x5, x5, x4 + eor x4, x4, x3 + adcs x6, x6, x4 + mov x4, 
#0xfffffffffffffffe // #-2 + and x4, x4, x3 + adcs x7, x7, x4 + adcs x8, x8, x3 + adcs x9, x9, x3 + adc x10, x10, x3 + stp x5, x6, [sp, #240] + stp x7, x8, [sp, #256] + stp x9, x10, [sp, #272] + ldp x5, x6, [sp, #48] + ldp x4, x3, [sp, #288] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #64] + sbcs x7, x7, x2 + sbcs x8, x8, x11 + ldp x9, x10, [sp, #80] + sbcs x9, x9, x12 + sbcs x10, x10, x13 + csetm x3, cc // cc = lo, ul, last + mov x4, #0xffffffff // #4294967295 + and x4, x4, x3 + adds x5, x5, x4 + eor x4, x4, x3 + adcs x6, x6, x4 + mov x4, #0xfffffffffffffffe // #-2 + and x4, x4, x3 + adcs x7, x7, x4 + adcs x8, x8, x3 + adcs x9, x9, x3 + adc x10, x10, x3 + stp x5, x6, [sp, #48] + stp x7, x8, [sp, #64] + stp x9, x10, [sp, #80] + ldr q1, [sp, #240] + ldp x9, x2, [sp, #240] + ldr q0, [sp, #240] + ldp x4, x6, [sp, #256] + rev64 v21.4s, v1.4s + uzp2 v28.4s, v1.4s, v1.4s + umulh x7, x9, x2 + xtn v17.2s, v1.2d + mul v27.4s, v21.4s, v0.4s + ldr q20, [sp, #272] + xtn v30.2s, v0.2d + ldr q1, [sp, #272] + uzp2 v31.4s, v0.4s, v0.4s + ldp x5, x10, [sp, #272] + umulh x8, x9, x4 + uaddlp v3.2d, v27.4s + umull v16.2d, v30.2s, v17.2s + mul x16, x9, x4 + umull v27.2d, v30.2s, v28.2s + shrn v0.2s, v20.2d, #32 + xtn v7.2s, v20.2d + shl v20.2d, v3.2d, #32 + umull v3.2d, v31.2s, v28.2s + mul x3, x2, x4 + umlal v20.2d, v30.2s, v17.2s + umull v22.2d, v7.2s, v0.2s + usra v27.2d, v16.2d, #32 + umulh x11, x2, x4 + movi v21.2d, #0xffffffff + uzp2 v28.4s, v1.4s, v1.4s + adds x15, x16, x7 + and v5.16b, v27.16b, v21.16b + adcs x3, x3, x8 + usra v3.2d, v27.2d, #32 + dup v29.2d, x6 + adcs x16, x11, xzr + mov x14, v20.d[0] + umlal v5.2d, v31.2s, v17.2s + mul x8, x9, x2 + mov x7, v20.d[1] + shl v19.2d, v22.2d, #33 + xtn v25.2s, v29.2d + rev64 v31.4s, v1.4s + lsl x13, x14, #32 + uzp2 v6.4s, v29.4s, v29.4s + umlal v19.2d, v7.2s, v7.2s + usra v3.2d, v5.2d, #32 + adds x1, x8, x8 + umulh x8, x4, x4 + add x12, x13, x14 + mul v17.4s, v31.4s, v29.4s + xtn v4.2s, v1.2d + adcs x14, x15, x15 + lsr x13, x12, #32 + adcs x15, x3, x3 + umull v31.2d, v25.2s, v28.2s + adcs x11, x16, x16 + umull v21.2d, v25.2s, v4.2s + mov x17, v3.d[0] + umull v18.2d, v6.2s, v28.2s + adc x16, x8, xzr + uaddlp v16.2d, v17.4s + movi v1.2d, #0xffffffff + subs x13, x13, x12 + usra v31.2d, v21.2d, #32 + sbc x8, x12, xzr + adds x17, x17, x1 + mul x1, x4, x4 + shl v28.2d, v16.2d, #32 + mov x3, v3.d[1] + adcs x14, x7, x14 + extr x7, x8, x13, #32 + adcs x13, x3, x15 + and v3.16b, v31.16b, v1.16b + adcs x11, x1, x11 + lsr x1, x8, #32 + umlal v3.2d, v6.2s, v4.2s + usra v18.2d, v31.2d, #32 + adc x3, x16, xzr + adds x1, x1, x12 + umlal v28.2d, v25.2s, v4.2s + adc x16, xzr, xzr + subs x15, x17, x7 + sbcs x7, x14, x1 + lsl x1, x15, #32 + sbcs x16, x13, x16 + add x8, x1, x15 + usra v18.2d, v3.2d, #32 + sbcs x14, x11, xzr + lsr x1, x8, #32 + sbcs x17, x3, xzr + sbc x11, x12, xzr + subs x13, x1, x8 + umulh x12, x4, x10 + sbc x1, x8, xzr + extr x13, x1, x13, #32 + lsr x1, x1, #32 + adds x15, x1, x8 + adc x1, xzr, xzr + subs x7, x7, x13 + sbcs x13, x16, x15 + lsl x3, x7, #32 + umulh x16, x2, x5 + sbcs x15, x14, x1 + add x7, x3, x7 + sbcs x3, x17, xzr + lsr x1, x7, #32 + sbcs x14, x11, xzr + sbc x11, x8, xzr + subs x8, x1, x7 + sbc x1, x7, xzr + extr x8, x1, x8, #32 + lsr x1, x1, #32 + adds x1, x1, x7 + adc x17, xzr, xzr + subs x13, x13, x8 + umulh x8, x9, x6 + sbcs x1, x15, x1 + sbcs x15, x3, x17 + sbcs x3, x14, xzr + mul x17, x2, x5 + sbcs x11, x11, xzr + stp x13, x1, [sp, #144] + sbc x14, x7, xzr + mul x7, x4, x10 + subs x1, x9, x2 + stp x15, x3, [sp, #160] + csetm 
x15, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + stp x11, x14, [sp, #176] + mul x14, x9, x6 + adds x17, x8, x17 + adcs x7, x16, x7 + adc x13, x12, xzr + subs x12, x5, x6 + cneg x3, x12, cc // cc = lo, ul, last + cinv x16, x15, cc // cc = lo, ul, last + mul x8, x1, x3 + umulh x1, x1, x3 + eor x12, x8, x16 + adds x11, x17, x14 + adcs x3, x7, x17 + adcs x15, x13, x7 + adc x8, x13, xzr + adds x3, x3, x14 + adcs x15, x15, x17 + adcs x17, x8, x7 + eor x1, x1, x16 + adc x13, x13, xzr + subs x9, x9, x4 + csetm x8, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x4, x2, x4 + cneg x4, x4, cc // cc = lo, ul, last + csetm x7, cc // cc = lo, ul, last + subs x2, x10, x6 + cinv x8, x8, cc // cc = lo, ul, last + cneg x2, x2, cc // cc = lo, ul, last + cmn x16, #0x1 + adcs x11, x11, x12 + mul x12, x9, x2 + adcs x3, x3, x1 + adcs x15, x15, x16 + umulh x9, x9, x2 + adcs x17, x17, x16 + adc x13, x13, x16 + subs x1, x10, x5 + cinv x2, x7, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + eor x9, x9, x8 + cmn x8, #0x1 + eor x7, x12, x8 + mul x12, x4, x1 + adcs x3, x3, x7 + adcs x7, x15, x9 + adcs x15, x17, x8 + ldp x9, x17, [sp, #160] + umulh x4, x4, x1 + adc x8, x13, x8 + cmn x2, #0x1 + eor x1, x12, x2 + adcs x1, x7, x1 + ldp x7, x16, [sp, #144] + eor x12, x4, x2 + adcs x4, x15, x12 + ldp x15, x12, [sp, #176] + adc x8, x8, x2 + adds x13, x14, x14 + umulh x14, x5, x10 + adcs x2, x11, x11 + adcs x3, x3, x3 + adcs x1, x1, x1 + adcs x4, x4, x4 + adcs x11, x8, x8 + adc x8, xzr, xzr + adds x13, x13, x7 + adcs x2, x2, x16 + mul x16, x5, x10 + adcs x3, x3, x9 + adcs x1, x1, x17 + umulh x5, x5, x5 + lsl x9, x13, #32 + add x9, x9, x13 + adcs x4, x4, x15 + mov x13, v28.d[1] + adcs x15, x11, x12 + lsr x7, x9, #32 + adc x11, x8, xzr + subs x7, x7, x9 + umulh x10, x10, x10 + sbc x17, x9, xzr + extr x7, x17, x7, #32 + lsr x17, x17, #32 + adds x17, x17, x9 + adc x12, xzr, xzr + subs x8, x2, x7 + sbcs x17, x3, x17 + lsl x7, x8, #32 + sbcs x2, x1, x12 + add x3, x7, x8 + sbcs x12, x4, xzr + lsr x1, x3, #32 + sbcs x7, x15, xzr + sbc x15, x9, xzr + subs x1, x1, x3 + sbc x4, x3, xzr + lsr x9, x4, #32 + extr x8, x4, x1, #32 + adds x9, x9, x3 + adc x4, xzr, xzr + subs x1, x17, x8 + lsl x17, x1, #32 + sbcs x8, x2, x9 + sbcs x9, x12, x4 + add x17, x17, x1 + mov x1, v18.d[1] + lsr x2, x17, #32 + sbcs x7, x7, xzr + mov x12, v18.d[0] + sbcs x15, x15, xzr + sbc x3, x3, xzr + subs x4, x2, x17 + sbc x2, x17, xzr + adds x12, x13, x12 + adcs x16, x16, x1 + lsr x13, x2, #32 + extr x1, x2, x4, #32 + adc x2, x14, xzr + adds x4, x13, x17 + mul x13, x6, x6 + adc x14, xzr, xzr + subs x1, x8, x1 + sbcs x4, x9, x4 + mov x9, v28.d[0] + sbcs x7, x7, x14 + sbcs x8, x15, xzr + sbcs x3, x3, xzr + sbc x14, x17, xzr + adds x17, x9, x9 + adcs x12, x12, x12 + mov x15, v19.d[0] + adcs x9, x16, x16 + umulh x6, x6, x6 + adcs x16, x2, x2 + adc x2, xzr, xzr + adds x11, x11, x8 + adcs x3, x3, xzr + adcs x14, x14, xzr + adcs x8, xzr, xzr + adds x13, x1, x13 + mov x1, v19.d[1] + adcs x6, x4, x6 + mov x4, #0xffffffff // #4294967295 + adcs x15, x7, x15 + adcs x7, x11, x5 + adcs x1, x3, x1 + adcs x14, x14, x10 + adc x11, x8, xzr + adds x6, x6, x17 + adcs x8, x15, x12 + adcs x3, x7, x9 + adcs x15, x1, x16 + mov x16, #0xffffffff00000001 // #-4294967295 + adcs x14, x14, x2 + mov x2, #0x1 // #1 + adc x17, x11, xzr + cmn x13, x16 + adcs xzr, x6, x4 + adcs xzr, x8, x2 + adcs xzr, x3, xzr + adcs xzr, x15, xzr + adcs xzr, x14, xzr + adc x1, x17, xzr + neg x9, x1 + and x1, x16, x9 + adds x11, x13, x1 + and x13, x4, x9 + 
adcs x5, x6, x13 + and x1, x2, x9 + adcs x7, x8, x1 + stp x11, x5, [sp, #144] + adcs x11, x3, xzr + adcs x2, x15, xzr + stp x7, x11, [sp, #160] + adc x17, x14, xzr + stp x2, x17, [sp, #176] + mov x0, sp + ldr q1, [sp, #48] + ldp x9, x2, [sp, #48] + ldr q0, [sp, #48] + ldp x4, x6, [sp, #64] + rev64 v21.4s, v1.4s + uzp2 v28.4s, v1.4s, v1.4s + umulh x7, x9, x2 + xtn v17.2s, v1.2d + mul v27.4s, v21.4s, v0.4s + ldr q20, [sp, #80] + xtn v30.2s, v0.2d + ldr q1, [sp, #80] + uzp2 v31.4s, v0.4s, v0.4s + ldp x5, x10, [sp, #80] + umulh x8, x9, x4 + uaddlp v3.2d, v27.4s + umull v16.2d, v30.2s, v17.2s + mul x16, x9, x4 + umull v27.2d, v30.2s, v28.2s + shrn v0.2s, v20.2d, #32 + xtn v7.2s, v20.2d + shl v20.2d, v3.2d, #32 + umull v3.2d, v31.2s, v28.2s + mul x3, x2, x4 + umlal v20.2d, v30.2s, v17.2s + umull v22.2d, v7.2s, v0.2s + usra v27.2d, v16.2d, #32 + umulh x11, x2, x4 + movi v21.2d, #0xffffffff + uzp2 v28.4s, v1.4s, v1.4s + adds x15, x16, x7 + and v5.16b, v27.16b, v21.16b + adcs x3, x3, x8 + usra v3.2d, v27.2d, #32 + dup v29.2d, x6 + adcs x16, x11, xzr + mov x14, v20.d[0] + umlal v5.2d, v31.2s, v17.2s + mul x8, x9, x2 + mov x7, v20.d[1] + shl v19.2d, v22.2d, #33 + xtn v25.2s, v29.2d + rev64 v31.4s, v1.4s + lsl x13, x14, #32 + uzp2 v6.4s, v29.4s, v29.4s + umlal v19.2d, v7.2s, v7.2s + usra v3.2d, v5.2d, #32 + adds x1, x8, x8 + umulh x8, x4, x4 + add x12, x13, x14 + mul v17.4s, v31.4s, v29.4s + xtn v4.2s, v1.2d + adcs x14, x15, x15 + lsr x13, x12, #32 + adcs x15, x3, x3 + umull v31.2d, v25.2s, v28.2s + adcs x11, x16, x16 + umull v21.2d, v25.2s, v4.2s + mov x17, v3.d[0] + umull v18.2d, v6.2s, v28.2s + adc x16, x8, xzr + uaddlp v16.2d, v17.4s + movi v1.2d, #0xffffffff + subs x13, x13, x12 + usra v31.2d, v21.2d, #32 + sbc x8, x12, xzr + adds x17, x17, x1 + mul x1, x4, x4 + shl v28.2d, v16.2d, #32 + mov x3, v3.d[1] + adcs x14, x7, x14 + extr x7, x8, x13, #32 + adcs x13, x3, x15 + and v3.16b, v31.16b, v1.16b + adcs x11, x1, x11 + lsr x1, x8, #32 + umlal v3.2d, v6.2s, v4.2s + usra v18.2d, v31.2d, #32 + adc x3, x16, xzr + adds x1, x1, x12 + umlal v28.2d, v25.2s, v4.2s + adc x16, xzr, xzr + subs x15, x17, x7 + sbcs x7, x14, x1 + lsl x1, x15, #32 + sbcs x16, x13, x16 + add x8, x1, x15 + usra v18.2d, v3.2d, #32 + sbcs x14, x11, xzr + lsr x1, x8, #32 + sbcs x17, x3, xzr + sbc x11, x12, xzr + subs x13, x1, x8 + umulh x12, x4, x10 + sbc x1, x8, xzr + extr x13, x1, x13, #32 + lsr x1, x1, #32 + adds x15, x1, x8 + adc x1, xzr, xzr + subs x7, x7, x13 + sbcs x13, x16, x15 + lsl x3, x7, #32 + umulh x16, x2, x5 + sbcs x15, x14, x1 + add x7, x3, x7 + sbcs x3, x17, xzr + lsr x1, x7, #32 + sbcs x14, x11, xzr + sbc x11, x8, xzr + subs x8, x1, x7 + sbc x1, x7, xzr + extr x8, x1, x8, #32 + lsr x1, x1, #32 + adds x1, x1, x7 + adc x17, xzr, xzr + subs x13, x13, x8 + umulh x8, x9, x6 + sbcs x1, x15, x1 + sbcs x15, x3, x17 + sbcs x3, x14, xzr + mul x17, x2, x5 + sbcs x11, x11, xzr + stp x13, x1, [x0] + sbc x14, x7, xzr + mul x7, x4, x10 + subs x1, x9, x2 + stp x15, x3, [x0, #16] + csetm x15, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + stp x11, x14, [x0, #32] + mul x14, x9, x6 + adds x17, x8, x17 + adcs x7, x16, x7 + adc x13, x12, xzr + subs x12, x5, x6 + cneg x3, x12, cc // cc = lo, ul, last + cinv x16, x15, cc // cc = lo, ul, last + mul x8, x1, x3 + umulh x1, x1, x3 + eor x12, x8, x16 + adds x11, x17, x14 + adcs x3, x7, x17 + adcs x15, x13, x7 + adc x8, x13, xzr + adds x3, x3, x14 + adcs x15, x15, x17 + adcs x17, x8, x7 + eor x1, x1, x16 + adc x13, x13, xzr + subs x9, x9, x4 + csetm x8, cc // cc = lo, ul, last + 
cneg x9, x9, cc // cc = lo, ul, last + subs x4, x2, x4 + cneg x4, x4, cc // cc = lo, ul, last + csetm x7, cc // cc = lo, ul, last + subs x2, x10, x6 + cinv x8, x8, cc // cc = lo, ul, last + cneg x2, x2, cc // cc = lo, ul, last + cmn x16, #0x1 + adcs x11, x11, x12 + mul x12, x9, x2 + adcs x3, x3, x1 + adcs x15, x15, x16 + umulh x9, x9, x2 + adcs x17, x17, x16 + adc x13, x13, x16 + subs x1, x10, x5 + cinv x2, x7, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + eor x9, x9, x8 + cmn x8, #0x1 + eor x7, x12, x8 + mul x12, x4, x1 + adcs x3, x3, x7 + adcs x7, x15, x9 + adcs x15, x17, x8 + ldp x9, x17, [x0, #16] + umulh x4, x4, x1 + adc x8, x13, x8 + cmn x2, #0x1 + eor x1, x12, x2 + adcs x1, x7, x1 + ldp x7, x16, [x0] + eor x12, x4, x2 + adcs x4, x15, x12 + ldp x15, x12, [x0, #32] + adc x8, x8, x2 + adds x13, x14, x14 + umulh x14, x5, x10 + adcs x2, x11, x11 + adcs x3, x3, x3 + adcs x1, x1, x1 + adcs x4, x4, x4 + adcs x11, x8, x8 + adc x8, xzr, xzr + adds x13, x13, x7 + adcs x2, x2, x16 + mul x16, x5, x10 + adcs x3, x3, x9 + adcs x1, x1, x17 + umulh x5, x5, x5 + lsl x9, x13, #32 + add x9, x9, x13 + adcs x4, x4, x15 + mov x13, v28.d[1] + adcs x15, x11, x12 + lsr x7, x9, #32 + adc x11, x8, xzr + subs x7, x7, x9 + umulh x10, x10, x10 + sbc x17, x9, xzr + extr x7, x17, x7, #32 + lsr x17, x17, #32 + adds x17, x17, x9 + adc x12, xzr, xzr + subs x8, x2, x7 + sbcs x17, x3, x17 + lsl x7, x8, #32 + sbcs x2, x1, x12 + add x3, x7, x8 + sbcs x12, x4, xzr + lsr x1, x3, #32 + sbcs x7, x15, xzr + sbc x15, x9, xzr + subs x1, x1, x3 + sbc x4, x3, xzr + lsr x9, x4, #32 + extr x8, x4, x1, #32 + adds x9, x9, x3 + adc x4, xzr, xzr + subs x1, x17, x8 + lsl x17, x1, #32 + sbcs x8, x2, x9 + sbcs x9, x12, x4 + add x17, x17, x1 + mov x1, v18.d[1] + lsr x2, x17, #32 + sbcs x7, x7, xzr + mov x12, v18.d[0] + sbcs x15, x15, xzr + sbc x3, x3, xzr + subs x4, x2, x17 + sbc x2, x17, xzr + adds x12, x13, x12 + adcs x16, x16, x1 + lsr x13, x2, #32 + extr x1, x2, x4, #32 + adc x2, x14, xzr + adds x4, x13, x17 + mul x13, x6, x6 + adc x14, xzr, xzr + subs x1, x8, x1 + sbcs x4, x9, x4 + mov x9, v28.d[0] + sbcs x7, x7, x14 + sbcs x8, x15, xzr + sbcs x3, x3, xzr + sbc x14, x17, xzr + adds x17, x9, x9 + adcs x12, x12, x12 + mov x15, v19.d[0] + adcs x9, x16, x16 + umulh x6, x6, x6 + adcs x16, x2, x2 + adc x2, xzr, xzr + adds x11, x11, x8 + adcs x3, x3, xzr + adcs x14, x14, xzr + adcs x8, xzr, xzr + adds x13, x1, x13 + mov x1, v19.d[1] + adcs x6, x4, x6 + mov x4, #0xffffffff // #4294967295 + adcs x15, x7, x15 + adcs x7, x11, x5 + adcs x1, x3, x1 + adcs x14, x14, x10 + adc x11, x8, xzr + adds x6, x6, x17 + adcs x8, x15, x12 + adcs x3, x7, x9 + adcs x15, x1, x16 + mov x16, #0xffffffff00000001 // #-4294967295 + adcs x14, x14, x2 + mov x2, #0x1 // #1 + adc x17, x11, xzr + cmn x13, x16 + adcs xzr, x6, x4 + adcs xzr, x8, x2 + adcs xzr, x3, xzr + adcs xzr, x15, xzr + adcs xzr, x14, xzr + adc x1, x17, xzr + neg x9, x1 + and x1, x16, x9 + adds x11, x13, x1 + and x13, x4, x9 + adcs x5, x6, x13 + and x1, x2, x9 + adcs x7, x8, x1 + stp x11, x5, [x0] + adcs x11, x3, xzr + adcs x2, x15, xzr + stp x7, x11, [x0, #16] + adc x17, x14, xzr + stp x2, x17, [x0, #32] + ldr q3, [sp, #144] + ldr q25, [sp, #192] + ldp x13, x23, [sp, #192] + ldp x3, x21, [sp, #144] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [sp, #224] + ldp x8, x24, [sp, #160] + subs x6, x3, x21 + ldr q0, [sp, #176] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, 
x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [sp, #208] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [sp, #224] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [sp, #176] + lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [sp, #192] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #208] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #224] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + 
adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [sp, #192] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #208] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #224] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [sp, #192] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [sp, #208] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #224] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #192] + ldp x21, x12, [sp, #208] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #224] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, 
x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x14, x24, x11 + stp x22, x5, [sp, #192] + adcs x5, x13, x23 + adcs x21, x8, x23 + stp x14, x5, [sp, #208] + adc x12, x15, x23 + stp x21, x12, [sp, #224] + ldr q3, [sp, #144] + ldr q25, [sp, #96] + ldp x13, x23, [sp, #96] + ldp x3, x21, [sp, #144] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [sp, #128] + ldp x8, x24, [sp, #160] + subs x6, x3, x21 + ldr q0, [sp, #176] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [sp, #112] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra 
v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [sp, #128] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [sp, #176] + lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [sp, #96] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #112] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #128] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [sp, #96] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #112] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #128] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [sp, #96] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [sp, #112] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #128] + cneg x3, x21, 
cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #96] + ldp x21, x12, [sp, #112] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #128] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x2, x24, x11 + stp x22, x5, [sp, #96] + adcs x11, x13, x23 + adcs x12, x8, x23 + stp x2, x11, [sp, #112] + adc x13, x15, x23 + stp x12, x13, [sp, #128] + mov x0, sp + mov x1, sp + ldp x5, x6, [x1] + ldp x4, x3, [sp, #192] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x1, #16] + ldp x4, x3, [sp, #208] + sbcs x7, x7, x4 + 
sbcs x8, x8, x3 + ldp x9, x10, [x1, #32] + ldp x4, x3, [sp, #224] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + csetm x3, cc // cc = lo, ul, last + mov x4, #0xffffffff // #4294967295 + and x4, x4, x3 + adds x5, x5, x4 + eor x4, x4, x3 + adcs x6, x6, x4 + mov x4, #0xfffffffffffffffe // #-2 + and x4, x4, x3 + adcs x7, x7, x4 + adcs x8, x8, x3 + adcs x9, x9, x3 + adc x10, x10, x3 + stp x5, x6, [x0] + stp x7, x8, [x0, #16] + stp x9, x10, [x0, #32] + ldp x5, x6, [sp, #96] + ldp x4, x3, [sp, #192] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x4, x3, [sp, #208] + sbcs x7, x2, x4 + sbcs x8, x11, x3 + ldp x4, x3, [sp, #224] + sbcs x9, x12, x4 + sbcs x10, x13, x3 + csetm x3, cc // cc = lo, ul, last + mov x4, #0xffffffff // #4294967295 + and x4, x4, x3 + adds x5, x5, x4 + eor x4, x4, x3 + adcs x6, x6, x4 + mov x4, #0xfffffffffffffffe // #-2 + and x4, x4, x3 + adcs x7, x7, x4 + adcs x8, x8, x3 + adcs x9, x9, x3 + adc x10, x10, x3 + stp x5, x6, [sp, #144] + stp x7, x8, [sp, #160] + stp x9, x10, [sp, #176] + ldr q3, [sp, #240] + ldr q25, [x25, #96] + ldp x13, x23, [x25, #96] + ldp x3, x21, [sp, #240] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [x25, #128] + ldp x8, x24, [sp, #256] + subs x6, x3, x21 + ldr q0, [sp, #272] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [x25, #112] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [x25, #128] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [sp, #272] + lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, 
xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [sp, #240] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #256] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #272] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [sp, #240] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #256] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #272] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [sp, #240] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [sp, #256] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #272] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, 
x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #240] + ldp x21, x12, [sp, #256] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #272] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x14, x24, x11 + stp x22, x5, [sp, #240] + adcs x5, x13, x23 + adcs x21, x8, x23 + stp x14, x5, [sp, #256] + adc x12, x15, x23 + stp x21, x12, [sp, #272] + mov x0, sp + mov x1, sp + ldp x5, x6, [x1] + ldp x4, x3, [sp, #96] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x1, #16] + ldp x4, x3, [sp, #112] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + ldp x9, x10, [x1, #32] + ldp x4, x3, [sp, #128] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + csetm x3, cc // cc = lo, ul, last + mov x4, #0xffffffff // #4294967295 + and x4, x4, x3 + adds x2, x5, x4 + eor x4, x4, x3 + adcs x11, x6, x4 + mov x4, #0xfffffffffffffffe // #-2 + and x4, x4, x3 + adcs x4, x7, x4 + adcs x12, x8, x3 + adcs x13, x9, x3 + adc x3, x10, x3 + stp x2, x11, [x0] + stp x4, x12, [x0, #16] + stp x13, x3, [x0, #32] + ldp x5, x6, [sp, #192] + subs x5, x5, x2 + 
sbcs x6, x6, x11 + ldp x7, x8, [sp, #208] + sbcs x7, x7, x4 + sbcs x8, x8, x12 + ldp x9, x10, [sp, #224] + sbcs x9, x9, x13 + sbcs x10, x10, x3 + csetm x3, cc // cc = lo, ul, last + mov x4, #0xffffffff // #4294967295 + and x4, x4, x3 + adds x5, x5, x4 + eor x4, x4, x3 + adcs x6, x6, x4 + mov x4, #0xfffffffffffffffe // #-2 + and x4, x4, x3 + adcs x7, x7, x4 + adcs x8, x8, x3 + adcs x9, x9, x3 + adc x10, x10, x3 + stp x5, x6, [sp, #192] + stp x7, x8, [sp, #208] + stp x9, x10, [sp, #224] + ldr q3, [sp, #144] + ldr q25, [sp, #288] + ldp x13, x23, [sp, #288] + ldp x3, x21, [sp, #144] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [sp, #320] + ldp x8, x24, [sp, #160] + subs x6, x3, x21 + ldr q0, [sp, #176] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [sp, #304] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [sp, #320] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [sp, #176] + lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + 
sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [sp, #144] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #160] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #176] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [sp, #144] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #160] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #176] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [sp, #144] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [sp, #160] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #176] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, 
x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #144] + ldp x21, x12, [sp, #160] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #176] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x14, x24, x11 + stp x22, x5, [sp, #144] + adcs x5, x13, x23 + adcs x21, x8, x23 + stp x14, x5, [sp, #160] + adc x12, x15, x23 + stp x21, x12, [sp, #176] + ldr q3, [sp, #240] + ldr q25, [x26, #96] + ldp x13, x23, [x26, #96] + ldp x3, x21, [sp, #240] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [x26, #128] + ldp x8, x24, [sp, #256] + subs x6, x3, x21 + ldr q0, [sp, #272] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [x26, #112] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, 
x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [x26, #128] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [sp, #272] + lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [sp, #240] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #256] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #272] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, 
x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [sp, #240] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #256] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #272] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [sp, #240] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [sp, #256] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #272] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #240] + ldp x21, x12, [sp, #256] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #272] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, 
x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x14, x24, x11 + stp x22, x5, [sp, #240] + adcs x5, x13, x23 + adcs x21, x8, x23 + stp x14, x5, [sp, #256] + adc x12, x15, x23 + stp x21, x12, [sp, #272] + ldp x2, x27, [sp, #0x150] + ldr q3, [sp, #48] + ldr q25, [sp, #192] + ldp x13, x23, [sp, #192] + ldp x3, x21, [sp, #48] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [sp, #224] + ldp x8, x24, [sp, #64] + subs x6, x3, x21 + ldr q0, [sp, #80] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [sp, #208] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [sp, #224] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [sp, #80] + lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr 
x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [sp, #192] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #208] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #224] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [sp, #192] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #208] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #224] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [sp, #192] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [sp, #208] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #224] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, 
x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #192] + ldp x21, x12, [sp, #208] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #224] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x2, x6, x20 + eor x3, x20, x23 + adcs x6, x7, x3 + adcs x7, x24, x11 + adcs x9, x13, x23 + adcs x10, x8, x23 + adc x11, x15, x23 + ldp x4, x3, [sp, #144] + subs x5, x2, x4 + sbcs x6, x6, x3 + ldp x4, x3, [sp, #160] + sbcs x7, x7, x4 + sbcs x8, x9, x3 + ldp x4, x3, [sp, #176] + sbcs x9, x10, x4 + sbcs x10, x11, x3 + csetm x3, cc // cc = lo, ul, last + mov x4, #0xffffffff // #4294967295 + and x4, x4, x3 + adds x19, x5, x4 + eor x4, x4, x3 + adcs x24, x6, x4 + mov x4, #0xfffffffffffffffe // #-2 + and x4, x4, x3 + adcs x7, x7, x4 + adcs x8, x8, x3 + adcs x9, x9, x3 + adc x10, x10, x3 + stp x7, x8, [sp, #208] + stp x9, x10, [sp, #224] + ldp x0, x1, [x25, #96] + ldp x2, x3, [x25, #112] + ldp x4, x5, [x25, #128] + orr x20, x0, x1 + orr x21, x2, x3 + orr x22, x4, x5 + orr x20, x20, x21 + orr x20, x20, x22 + cmp x20, xzr + cset x20, ne // ne = any + ldp x6, x7, [x26, #96] + ldp x8, x9, [x26, #112] + ldp x10, x11, [x26, #128] + orr x21, x6, x7 + orr x22, x8, x9 + orr x23, x10, x11 + orr x21, x21, x22 + orr x21, x21, x23 + cmp x21, xzr + cset x21, ne // ne = any + cmp x21, x20 + ldp x12, x13, [sp, #240] + csel x12, x0, x12, cc // cc 
= lo, ul, last + csel x13, x1, x13, cc // cc = lo, ul, last + csel x12, x6, x12, hi // hi = pmore + csel x13, x7, x13, hi // hi = pmore + ldp x14, x15, [sp, #256] + csel x14, x2, x14, cc // cc = lo, ul, last + csel x15, x3, x15, cc // cc = lo, ul, last + csel x14, x8, x14, hi // hi = pmore + csel x15, x9, x15, hi // hi = pmore + ldp x16, x17, [sp, #272] + csel x16, x4, x16, cc // cc = lo, ul, last + csel x17, x5, x17, cc // cc = lo, ul, last + csel x16, x10, x16, hi // hi = pmore + csel x17, x11, x17, hi // hi = pmore + ldp x20, x21, [x25] + ldp x0, x1, [sp] + csel x0, x20, x0, cc // cc = lo, ul, last + csel x1, x21, x1, cc // cc = lo, ul, last + ldp x20, x21, [x26] + csel x0, x20, x0, hi // hi = pmore + csel x1, x21, x1, hi // hi = pmore + ldp x20, x21, [x25, #16] + ldp x2, x3, [sp, #16] + csel x2, x20, x2, cc // cc = lo, ul, last + csel x3, x21, x3, cc // cc = lo, ul, last + ldp x20, x21, [x26, #16] + csel x2, x20, x2, hi // hi = pmore + csel x3, x21, x3, hi // hi = pmore + ldp x20, x21, [x25, #32] + ldp x4, x5, [sp, #32] + csel x4, x20, x4, cc // cc = lo, ul, last + csel x5, x21, x5, cc // cc = lo, ul, last + ldp x20, x21, [x26, #32] + csel x4, x20, x4, hi // hi = pmore + csel x5, x21, x5, hi // hi = pmore + ldp x20, x21, [x25, #48] + csel x6, x20, x19, cc // cc = lo, ul, last + csel x7, x21, x24, cc // cc = lo, ul, last + ldp x20, x21, [x26, #48] + csel x6, x20, x6, hi // hi = pmore + csel x7, x21, x7, hi // hi = pmore + ldp x20, x21, [x25, #64] + ldp x8, x9, [sp, #208] + csel x8, x20, x8, cc // cc = lo, ul, last + csel x9, x21, x9, cc // cc = lo, ul, last + ldp x20, x21, [x26, #64] + csel x8, x20, x8, hi // hi = pmore + csel x9, x21, x9, hi // hi = pmore + ldp x20, x21, [x25, #80] + ldp x10, x11, [sp, #224] + csel x10, x20, x10, cc // cc = lo, ul, last + csel x11, x21, x11, cc // cc = lo, ul, last + ldp x20, x21, [x26, #80] + csel x10, x20, x10, hi // hi = pmore + csel x11, x21, x11, hi // hi = pmore + stp x0, x1, [x27] + stp x2, x3, [x27, #16] + stp x4, x5, [x27, #32] + stp x6, x7, [x27, #48] + stp x8, x9, [x27, #64] + stp x10, x11, [x27, #80] + stp x12, x13, [x27, #96] + stp x14, x15, [x27, #112] + stp x16, x17, [x27, #128] + add sp, sp, #0x180 + ldp x27, xzr, [sp], #16 + ldp x25, x26, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +p384_montjscalarmul_p384_montjdouble: + sub sp, sp, #0x1a0 + stp x19, x20, [sp, #336] + stp x21, x22, [sp, #352] + stp x23, x24, [sp, #368] + stp x25, x26, [sp, #384] + stp x27, xzr, [sp, #400] + mov x25, x0 + mov x26, x1 + mov x0, sp + ldr q1, [x26, #96] + ldp x9, x2, [x26, #96] + ldr q0, [x26, #96] + ldp x4, x6, [x26, #112] + rev64 v21.4s, v1.4s + uzp2 v28.4s, v1.4s, v1.4s + umulh x7, x9, x2 + xtn v17.2s, v1.2d + mul v27.4s, v21.4s, v0.4s + ldr q20, [x26, #128] + xtn v30.2s, v0.2d + ldr q1, [x26, #128] + uzp2 v31.4s, v0.4s, v0.4s + ldp x5, x10, [x26, #128] + umulh x8, x9, x4 + uaddlp v3.2d, v27.4s + umull v16.2d, v30.2s, v17.2s + mul x16, x9, x4 + umull v27.2d, v30.2s, v28.2s + shrn v0.2s, v20.2d, #32 + xtn v7.2s, v20.2d + shl v20.2d, v3.2d, #32 + umull v3.2d, v31.2s, v28.2s + mul x3, x2, x4 + umlal v20.2d, v30.2s, v17.2s + umull v22.2d, v7.2s, v0.2s + usra v27.2d, v16.2d, #32 + umulh x11, x2, x4 + movi v21.2d, #0xffffffff + uzp2 v28.4s, v1.4s, v1.4s + adds x15, x16, x7 + and v5.16b, v27.16b, v21.16b + adcs x3, x3, x8 + usra v3.2d, v27.2d, #32 + dup v29.2d, x6 + adcs x16, x11, xzr + mov x14, v20.d[0] + umlal v5.2d, v31.2s, v17.2s + mul x8, x9, x2 + mov x7, v20.d[1] + shl v19.2d, v22.2d, #33 + xtn v25.2s, 
v29.2d + rev64 v31.4s, v1.4s + lsl x13, x14, #32 + uzp2 v6.4s, v29.4s, v29.4s + umlal v19.2d, v7.2s, v7.2s + usra v3.2d, v5.2d, #32 + adds x1, x8, x8 + umulh x8, x4, x4 + add x12, x13, x14 + mul v17.4s, v31.4s, v29.4s + xtn v4.2s, v1.2d + adcs x14, x15, x15 + lsr x13, x12, #32 + adcs x15, x3, x3 + umull v31.2d, v25.2s, v28.2s + adcs x11, x16, x16 + umull v21.2d, v25.2s, v4.2s + mov x17, v3.d[0] + umull v18.2d, v6.2s, v28.2s + adc x16, x8, xzr + uaddlp v16.2d, v17.4s + movi v1.2d, #0xffffffff + subs x13, x13, x12 + usra v31.2d, v21.2d, #32 + sbc x8, x12, xzr + adds x17, x17, x1 + mul x1, x4, x4 + shl v28.2d, v16.2d, #32 + mov x3, v3.d[1] + adcs x14, x7, x14 + extr x7, x8, x13, #32 + adcs x13, x3, x15 + and v3.16b, v31.16b, v1.16b + adcs x11, x1, x11 + lsr x1, x8, #32 + umlal v3.2d, v6.2s, v4.2s + usra v18.2d, v31.2d, #32 + adc x3, x16, xzr + adds x1, x1, x12 + umlal v28.2d, v25.2s, v4.2s + adc x16, xzr, xzr + subs x15, x17, x7 + sbcs x7, x14, x1 + lsl x1, x15, #32 + sbcs x16, x13, x16 + add x8, x1, x15 + usra v18.2d, v3.2d, #32 + sbcs x14, x11, xzr + lsr x1, x8, #32 + sbcs x17, x3, xzr + sbc x11, x12, xzr + subs x13, x1, x8 + umulh x12, x4, x10 + sbc x1, x8, xzr + extr x13, x1, x13, #32 + lsr x1, x1, #32 + adds x15, x1, x8 + adc x1, xzr, xzr + subs x7, x7, x13 + sbcs x13, x16, x15 + lsl x3, x7, #32 + umulh x16, x2, x5 + sbcs x15, x14, x1 + add x7, x3, x7 + sbcs x3, x17, xzr + lsr x1, x7, #32 + sbcs x14, x11, xzr + sbc x11, x8, xzr + subs x8, x1, x7 + sbc x1, x7, xzr + extr x8, x1, x8, #32 + lsr x1, x1, #32 + adds x1, x1, x7 + adc x17, xzr, xzr + subs x13, x13, x8 + umulh x8, x9, x6 + sbcs x1, x15, x1 + sbcs x15, x3, x17 + sbcs x3, x14, xzr + mul x17, x2, x5 + sbcs x11, x11, xzr + stp x13, x1, [x0] + sbc x14, x7, xzr + mul x7, x4, x10 + subs x1, x9, x2 + stp x15, x3, [x0, #16] + csetm x15, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + stp x11, x14, [x0, #32] + mul x14, x9, x6 + adds x17, x8, x17 + adcs x7, x16, x7 + adc x13, x12, xzr + subs x12, x5, x6 + cneg x3, x12, cc // cc = lo, ul, last + cinv x16, x15, cc // cc = lo, ul, last + mul x8, x1, x3 + umulh x1, x1, x3 + eor x12, x8, x16 + adds x11, x17, x14 + adcs x3, x7, x17 + adcs x15, x13, x7 + adc x8, x13, xzr + adds x3, x3, x14 + adcs x15, x15, x17 + adcs x17, x8, x7 + eor x1, x1, x16 + adc x13, x13, xzr + subs x9, x9, x4 + csetm x8, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x4, x2, x4 + cneg x4, x4, cc // cc = lo, ul, last + csetm x7, cc // cc = lo, ul, last + subs x2, x10, x6 + cinv x8, x8, cc // cc = lo, ul, last + cneg x2, x2, cc // cc = lo, ul, last + cmn x16, #0x1 + adcs x11, x11, x12 + mul x12, x9, x2 + adcs x3, x3, x1 + adcs x15, x15, x16 + umulh x9, x9, x2 + adcs x17, x17, x16 + adc x13, x13, x16 + subs x1, x10, x5 + cinv x2, x7, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + eor x9, x9, x8 + cmn x8, #0x1 + eor x7, x12, x8 + mul x12, x4, x1 + adcs x3, x3, x7 + adcs x7, x15, x9 + adcs x15, x17, x8 + ldp x9, x17, [x0, #16] + umulh x4, x4, x1 + adc x8, x13, x8 + cmn x2, #0x1 + eor x1, x12, x2 + adcs x1, x7, x1 + ldp x7, x16, [x0] + eor x12, x4, x2 + adcs x4, x15, x12 + ldp x15, x12, [x0, #32] + adc x8, x8, x2 + adds x13, x14, x14 + umulh x14, x5, x10 + adcs x2, x11, x11 + adcs x3, x3, x3 + adcs x1, x1, x1 + adcs x4, x4, x4 + adcs x11, x8, x8 + adc x8, xzr, xzr + adds x13, x13, x7 + adcs x2, x2, x16 + mul x16, x5, x10 + adcs x3, x3, x9 + adcs x1, x1, x17 + umulh x5, x5, x5 + lsl x9, x13, #32 + add x9, x9, x13 + adcs x4, x4, x15 + mov x13, v28.d[1] + adcs x15, x11, x12 
+ lsr x7, x9, #32 + adc x11, x8, xzr + subs x7, x7, x9 + umulh x10, x10, x10 + sbc x17, x9, xzr + extr x7, x17, x7, #32 + lsr x17, x17, #32 + adds x17, x17, x9 + adc x12, xzr, xzr + subs x8, x2, x7 + sbcs x17, x3, x17 + lsl x7, x8, #32 + sbcs x2, x1, x12 + add x3, x7, x8 + sbcs x12, x4, xzr + lsr x1, x3, #32 + sbcs x7, x15, xzr + sbc x15, x9, xzr + subs x1, x1, x3 + sbc x4, x3, xzr + lsr x9, x4, #32 + extr x8, x4, x1, #32 + adds x9, x9, x3 + adc x4, xzr, xzr + subs x1, x17, x8 + lsl x17, x1, #32 + sbcs x8, x2, x9 + sbcs x9, x12, x4 + add x17, x17, x1 + mov x1, v18.d[1] + lsr x2, x17, #32 + sbcs x7, x7, xzr + mov x12, v18.d[0] + sbcs x15, x15, xzr + sbc x3, x3, xzr + subs x4, x2, x17 + sbc x2, x17, xzr + adds x12, x13, x12 + adcs x16, x16, x1 + lsr x13, x2, #32 + extr x1, x2, x4, #32 + adc x2, x14, xzr + adds x4, x13, x17 + mul x13, x6, x6 + adc x14, xzr, xzr + subs x1, x8, x1 + sbcs x4, x9, x4 + mov x9, v28.d[0] + sbcs x7, x7, x14 + sbcs x8, x15, xzr + sbcs x3, x3, xzr + sbc x14, x17, xzr + adds x17, x9, x9 + adcs x12, x12, x12 + mov x15, v19.d[0] + adcs x9, x16, x16 + umulh x6, x6, x6 + adcs x16, x2, x2 + adc x2, xzr, xzr + adds x11, x11, x8 + adcs x3, x3, xzr + adcs x14, x14, xzr + adcs x8, xzr, xzr + adds x13, x1, x13 + mov x1, v19.d[1] + adcs x6, x4, x6 + mov x4, #0xffffffff // #4294967295 + adcs x15, x7, x15 + adcs x7, x11, x5 + adcs x1, x3, x1 + adcs x14, x14, x10 + adc x11, x8, xzr + adds x6, x6, x17 + adcs x8, x15, x12 + adcs x3, x7, x9 + adcs x15, x1, x16 + mov x16, #0xffffffff00000001 // #-4294967295 + adcs x14, x14, x2 + mov x2, #0x1 // #1 + adc x17, x11, xzr + cmn x13, x16 + adcs xzr, x6, x4 + adcs xzr, x8, x2 + adcs xzr, x3, xzr + adcs xzr, x15, xzr + adcs xzr, x14, xzr + adc x1, x17, xzr + neg x9, x1 + and x1, x16, x9 + adds x11, x13, x1 + and x13, x4, x9 + adcs x5, x6, x13 + and x1, x2, x9 + adcs x7, x8, x1 + stp x11, x5, [x0] + adcs x11, x3, xzr + adcs x2, x15, xzr + stp x7, x11, [x0, #16] + adc x17, x14, xzr + stp x2, x17, [x0, #32] + ldr q1, [x26, #48] + ldp x9, x2, [x26, #48] + ldr q0, [x26, #48] + ldp x4, x6, [x26, #64] + rev64 v21.4s, v1.4s + uzp2 v28.4s, v1.4s, v1.4s + umulh x7, x9, x2 + xtn v17.2s, v1.2d + mul v27.4s, v21.4s, v0.4s + ldr q20, [x26, #80] + xtn v30.2s, v0.2d + ldr q1, [x26, #80] + uzp2 v31.4s, v0.4s, v0.4s + ldp x5, x10, [x26, #80] + umulh x8, x9, x4 + uaddlp v3.2d, v27.4s + umull v16.2d, v30.2s, v17.2s + mul x16, x9, x4 + umull v27.2d, v30.2s, v28.2s + shrn v0.2s, v20.2d, #32 + xtn v7.2s, v20.2d + shl v20.2d, v3.2d, #32 + umull v3.2d, v31.2s, v28.2s + mul x3, x2, x4 + umlal v20.2d, v30.2s, v17.2s + umull v22.2d, v7.2s, v0.2s + usra v27.2d, v16.2d, #32 + umulh x11, x2, x4 + movi v21.2d, #0xffffffff + uzp2 v28.4s, v1.4s, v1.4s + adds x15, x16, x7 + and v5.16b, v27.16b, v21.16b + adcs x3, x3, x8 + usra v3.2d, v27.2d, #32 + dup v29.2d, x6 + adcs x16, x11, xzr + mov x14, v20.d[0] + umlal v5.2d, v31.2s, v17.2s + mul x8, x9, x2 + mov x7, v20.d[1] + shl v19.2d, v22.2d, #33 + xtn v25.2s, v29.2d + rev64 v31.4s, v1.4s + lsl x13, x14, #32 + uzp2 v6.4s, v29.4s, v29.4s + umlal v19.2d, v7.2s, v7.2s + usra v3.2d, v5.2d, #32 + adds x1, x8, x8 + umulh x8, x4, x4 + add x12, x13, x14 + mul v17.4s, v31.4s, v29.4s + xtn v4.2s, v1.2d + adcs x14, x15, x15 + lsr x13, x12, #32 + adcs x15, x3, x3 + umull v31.2d, v25.2s, v28.2s + adcs x11, x16, x16 + umull v21.2d, v25.2s, v4.2s + mov x17, v3.d[0] + umull v18.2d, v6.2s, v28.2s + adc x16, x8, xzr + uaddlp v16.2d, v17.4s + movi v1.2d, #0xffffffff + subs x13, x13, x12 + usra v31.2d, v21.2d, #32 + sbc x8, x12, xzr + adds x17, x17, x1 
+ mul x1, x4, x4 + shl v28.2d, v16.2d, #32 + mov x3, v3.d[1] + adcs x14, x7, x14 + extr x7, x8, x13, #32 + adcs x13, x3, x15 + and v3.16b, v31.16b, v1.16b + adcs x11, x1, x11 + lsr x1, x8, #32 + umlal v3.2d, v6.2s, v4.2s + usra v18.2d, v31.2d, #32 + adc x3, x16, xzr + adds x1, x1, x12 + umlal v28.2d, v25.2s, v4.2s + adc x16, xzr, xzr + subs x15, x17, x7 + sbcs x7, x14, x1 + lsl x1, x15, #32 + sbcs x16, x13, x16 + add x8, x1, x15 + usra v18.2d, v3.2d, #32 + sbcs x14, x11, xzr + lsr x1, x8, #32 + sbcs x17, x3, xzr + sbc x11, x12, xzr + subs x13, x1, x8 + umulh x12, x4, x10 + sbc x1, x8, xzr + extr x13, x1, x13, #32 + lsr x1, x1, #32 + adds x15, x1, x8 + adc x1, xzr, xzr + subs x7, x7, x13 + sbcs x13, x16, x15 + lsl x3, x7, #32 + umulh x16, x2, x5 + sbcs x15, x14, x1 + add x7, x3, x7 + sbcs x3, x17, xzr + lsr x1, x7, #32 + sbcs x14, x11, xzr + sbc x11, x8, xzr + subs x8, x1, x7 + sbc x1, x7, xzr + extr x8, x1, x8, #32 + lsr x1, x1, #32 + adds x1, x1, x7 + adc x17, xzr, xzr + subs x13, x13, x8 + umulh x8, x9, x6 + sbcs x1, x15, x1 + sbcs x15, x3, x17 + sbcs x3, x14, xzr + mul x17, x2, x5 + sbcs x11, x11, xzr + stp x13, x1, [sp, #48] + sbc x14, x7, xzr + mul x7, x4, x10 + subs x1, x9, x2 + stp x15, x3, [sp, #64] + csetm x15, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + stp x11, x14, [sp, #80] + mul x14, x9, x6 + adds x17, x8, x17 + adcs x7, x16, x7 + adc x13, x12, xzr + subs x12, x5, x6 + cneg x3, x12, cc // cc = lo, ul, last + cinv x16, x15, cc // cc = lo, ul, last + mul x8, x1, x3 + umulh x1, x1, x3 + eor x12, x8, x16 + adds x11, x17, x14 + adcs x3, x7, x17 + adcs x15, x13, x7 + adc x8, x13, xzr + adds x3, x3, x14 + adcs x15, x15, x17 + adcs x17, x8, x7 + eor x1, x1, x16 + adc x13, x13, xzr + subs x9, x9, x4 + csetm x8, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x4, x2, x4 + cneg x4, x4, cc // cc = lo, ul, last + csetm x7, cc // cc = lo, ul, last + subs x2, x10, x6 + cinv x8, x8, cc // cc = lo, ul, last + cneg x2, x2, cc // cc = lo, ul, last + cmn x16, #0x1 + adcs x11, x11, x12 + mul x12, x9, x2 + adcs x3, x3, x1 + adcs x15, x15, x16 + umulh x9, x9, x2 + adcs x17, x17, x16 + adc x13, x13, x16 + subs x1, x10, x5 + cinv x2, x7, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + eor x9, x9, x8 + cmn x8, #0x1 + eor x7, x12, x8 + mul x12, x4, x1 + adcs x3, x3, x7 + adcs x7, x15, x9 + adcs x15, x17, x8 + ldp x9, x17, [sp, #64] + umulh x4, x4, x1 + adc x8, x13, x8 + cmn x2, #0x1 + eor x1, x12, x2 + adcs x1, x7, x1 + ldp x7, x16, [sp, #48] + eor x12, x4, x2 + adcs x4, x15, x12 + ldp x15, x12, [sp, #80] + adc x8, x8, x2 + adds x13, x14, x14 + umulh x14, x5, x10 + adcs x2, x11, x11 + adcs x3, x3, x3 + adcs x1, x1, x1 + adcs x4, x4, x4 + adcs x11, x8, x8 + adc x8, xzr, xzr + adds x13, x13, x7 + adcs x2, x2, x16 + mul x16, x5, x10 + adcs x3, x3, x9 + adcs x1, x1, x17 + umulh x5, x5, x5 + lsl x9, x13, #32 + add x9, x9, x13 + adcs x4, x4, x15 + mov x13, v28.d[1] + adcs x15, x11, x12 + lsr x7, x9, #32 + adc x11, x8, xzr + subs x7, x7, x9 + umulh x10, x10, x10 + sbc x17, x9, xzr + extr x7, x17, x7, #32 + lsr x17, x17, #32 + adds x17, x17, x9 + adc x12, xzr, xzr + subs x8, x2, x7 + sbcs x17, x3, x17 + lsl x7, x8, #32 + sbcs x2, x1, x12 + add x3, x7, x8 + sbcs x12, x4, xzr + lsr x1, x3, #32 + sbcs x7, x15, xzr + sbc x15, x9, xzr + subs x1, x1, x3 + sbc x4, x3, xzr + lsr x9, x4, #32 + extr x8, x4, x1, #32 + adds x9, x9, x3 + adc x4, xzr, xzr + subs x1, x17, x8 + lsl x17, x1, #32 + sbcs x8, x2, x9 + sbcs x9, x12, x4 + add x17, x17, x1 + mov x1, 
v18.d[1] + lsr x2, x17, #32 + sbcs x7, x7, xzr + mov x12, v18.d[0] + sbcs x15, x15, xzr + sbc x3, x3, xzr + subs x4, x2, x17 + sbc x2, x17, xzr + adds x12, x13, x12 + adcs x16, x16, x1 + lsr x13, x2, #32 + extr x1, x2, x4, #32 + adc x2, x14, xzr + adds x4, x13, x17 + mul x13, x6, x6 + adc x14, xzr, xzr + subs x1, x8, x1 + sbcs x4, x9, x4 + mov x9, v28.d[0] + sbcs x7, x7, x14 + sbcs x8, x15, xzr + sbcs x3, x3, xzr + sbc x14, x17, xzr + adds x17, x9, x9 + adcs x12, x12, x12 + mov x15, v19.d[0] + adcs x9, x16, x16 + umulh x6, x6, x6 + adcs x16, x2, x2 + adc x2, xzr, xzr + adds x11, x11, x8 + adcs x3, x3, xzr + adcs x14, x14, xzr + adcs x8, xzr, xzr + adds x13, x1, x13 + mov x1, v19.d[1] + adcs x6, x4, x6 + mov x4, #0xffffffff // #4294967295 + adcs x15, x7, x15 + adcs x7, x11, x5 + adcs x1, x3, x1 + adcs x14, x14, x10 + adc x11, x8, xzr + adds x6, x6, x17 + adcs x8, x15, x12 + adcs x3, x7, x9 + adcs x15, x1, x16 + mov x16, #0xffffffff00000001 // #-4294967295 + adcs x14, x14, x2 + mov x2, #0x1 // #1 + adc x17, x11, xzr + cmn x13, x16 + adcs xzr, x6, x4 + adcs xzr, x8, x2 + adcs xzr, x3, xzr + adcs xzr, x15, xzr + adcs xzr, x14, xzr + adc x1, x17, xzr + neg x9, x1 + and x1, x16, x9 + adds x11, x13, x1 + and x13, x4, x9 + adcs x5, x6, x13 + and x1, x2, x9 + adcs x7, x8, x1 + stp x11, x5, [sp, #48] + adcs x11, x3, xzr + adcs x2, x15, xzr + stp x7, x11, [sp, #64] + adc x17, x14, xzr + stp x2, x17, [sp, #80] + ldp x5, x6, [x26] + ldp x4, x3, [sp] + adds x5, x5, x4 + adcs x6, x6, x3 + ldp x7, x8, [x26, #16] + ldp x4, x3, [sp, #16] + adcs x7, x7, x4 + adcs x8, x8, x3 + ldp x9, x10, [x26, #32] + ldp x4, x3, [sp, #32] + adcs x9, x9, x4 + adcs x10, x10, x3 + csetm x3, cs // cs = hs, nlast + mov x4, #0xffffffff // #4294967295 + and x4, x4, x3 + subs x5, x5, x4 + eor x4, x4, x3 + sbcs x6, x6, x4 + mov x4, #0xfffffffffffffffe // #-2 + and x4, x4, x3 + sbcs x7, x7, x4 + sbcs x8, x8, x3 + sbcs x9, x9, x3 + sbc x10, x10, x3 + stp x5, x6, [sp, #240] + stp x7, x8, [sp, #256] + stp x9, x10, [sp, #272] + mov x2, sp + ldp x5, x6, [x26] + ldp x4, x3, [x2] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x26, #16] + ldp x4, x3, [x2, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + ldp x9, x10, [x26, #32] + ldp x4, x3, [x2, #32] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + csetm x3, cc // cc = lo, ul, last + mov x4, #0xffffffff // #4294967295 + and x4, x4, x3 + adds x13, x5, x4 + eor x4, x4, x3 + adcs x23, x6, x4 + mov x4, #0xfffffffffffffffe // #-2 + and x4, x4, x3 + adcs x7, x7, x4 + adcs x8, x8, x3 + adcs x9, x9, x3 + adc x10, x10, x3 + stp x13, x23, [sp, #192] + stp x7, x8, [sp, #208] + stp x9, x10, [sp, #224] + ldr q3, [sp, #240] + ldr q25, [sp, #192] + ldp x3, x21, [sp, #240] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [sp, #224] + ldp x8, x24, [sp, #256] + subs x6, x3, x21 + ldr q0, [sp, #272] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [sp, #208] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // 
cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [sp, #224] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [sp, #272] + lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [sp, #96] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #112] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #128] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg 
x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [sp, #96] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #112] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #128] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [sp, #96] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [sp, #112] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #128] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #96] + ldp x21, x12, [sp, #112] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #128] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, 
x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x14, x24, x11 + stp x22, x5, [sp, #96] + adcs x5, x13, x23 + adcs x21, x8, x23 + stp x14, x5, [sp, #112] + adc x12, x15, x23 + stp x21, x12, [sp, #128] + ldp x5, x6, [x26, #48] + ldp x4, x3, [x26, #96] + adds x5, x5, x4 + adcs x6, x6, x3 + ldp x7, x8, [x26, #64] + ldp x4, x3, [x26, #112] + adcs x7, x7, x4 + adcs x8, x8, x3 + ldp x9, x10, [x26, #80] + ldp x4, x3, [x26, #128] + adcs x9, x9, x4 + adcs x10, x10, x3 + adc x3, xzr, xzr + mov x4, #0xffffffff // #4294967295 + cmp x5, x4 + mov x4, #0xffffffff00000000 // #-4294967296 + sbcs xzr, x6, x4 + mov x4, #0xfffffffffffffffe // #-2 + sbcs xzr, x7, x4 + adcs xzr, x8, xzr + adcs xzr, x9, xzr + adcs xzr, x10, xzr + adcs x3, x3, xzr + csetm x3, ne // ne = any + mov x4, #0xffffffff // #4294967295 + and x4, x4, x3 + subs x5, x5, x4 + eor x4, x4, x3 + sbcs x6, x6, x4 + mov x4, #0xfffffffffffffffe // #-2 + and x4, x4, x3 + sbcs x7, x7, x4 + sbcs x8, x8, x3 + sbcs x9, x9, x3 + sbc x10, x10, x3 + stp x5, x6, [sp, #240] + stp x7, x8, [sp, #256] + stp x9, x10, [sp, #272] + ldr q1, [sp, #96] + ldp x9, x2, [sp, #96] + ldr q0, [sp, #96] + ldp x4, x6, [sp, #112] + rev64 v21.4s, v1.4s + uzp2 v28.4s, v1.4s, v1.4s + umulh x7, x9, x2 + xtn v17.2s, v1.2d + mul v27.4s, v21.4s, v0.4s + ldr q20, [sp, #128] + xtn v30.2s, v0.2d + ldr q1, [sp, #128] + uzp2 v31.4s, v0.4s, v0.4s + ldp x5, x10, [sp, #128] + umulh x8, x9, x4 + uaddlp v3.2d, v27.4s + umull v16.2d, v30.2s, v17.2s + mul x16, x9, x4 + umull v27.2d, v30.2s, v28.2s + shrn v0.2s, v20.2d, #32 + xtn v7.2s, v20.2d + shl v20.2d, v3.2d, #32 + umull v3.2d, v31.2s, v28.2s + mul x3, x2, x4 + umlal v20.2d, v30.2s, v17.2s + umull v22.2d, v7.2s, v0.2s + usra v27.2d, v16.2d, #32 + umulh x11, x2, x4 + movi v21.2d, #0xffffffff + uzp2 v28.4s, v1.4s, v1.4s + adds x15, x16, x7 + and v5.16b, v27.16b, v21.16b + adcs x3, x3, x8 + usra v3.2d, v27.2d, #32 + dup v29.2d, x6 + adcs x16, x11, xzr + mov x14, v20.d[0] + umlal v5.2d, v31.2s, v17.2s + mul x8, x9, x2 + mov x7, v20.d[1] + shl v19.2d, v22.2d, #33 + xtn v25.2s, v29.2d + rev64 v31.4s, v1.4s + lsl x13, x14, #32 + uzp2 v6.4s, v29.4s, v29.4s + umlal v19.2d, v7.2s, v7.2s + usra v3.2d, v5.2d, #32 + adds x1, x8, x8 + umulh x8, x4, x4 + add x12, x13, x14 + mul v17.4s, v31.4s, v29.4s + xtn v4.2s, v1.2d + adcs x14, x15, x15 + lsr x13, x12, #32 + adcs x15, x3, x3 + umull v31.2d, v25.2s, v28.2s + adcs x11, x16, x16 + umull v21.2d, v25.2s, v4.2s + mov x17, v3.d[0] + umull v18.2d, v6.2s, v28.2s + adc x16, x8, xzr + uaddlp v16.2d, v17.4s + movi v1.2d, #0xffffffff + subs x13, x13, x12 + usra v31.2d, v21.2d, #32 + sbc x8, x12, xzr + adds x17, x17, x1 + mul x1, x4, x4 + shl v28.2d, v16.2d, #32 + mov x3, v3.d[1] + adcs x14, x7, x14 + extr x7, x8, x13, #32 + adcs x13, x3, x15 + and v3.16b, v31.16b, v1.16b + adcs x11, x1, x11 + lsr x1, 
x8, #32 + umlal v3.2d, v6.2s, v4.2s + usra v18.2d, v31.2d, #32 + adc x3, x16, xzr + adds x1, x1, x12 + umlal v28.2d, v25.2s, v4.2s + adc x16, xzr, xzr + subs x15, x17, x7 + sbcs x7, x14, x1 + lsl x1, x15, #32 + sbcs x16, x13, x16 + add x8, x1, x15 + usra v18.2d, v3.2d, #32 + sbcs x14, x11, xzr + lsr x1, x8, #32 + sbcs x17, x3, xzr + sbc x11, x12, xzr + subs x13, x1, x8 + umulh x12, x4, x10 + sbc x1, x8, xzr + extr x13, x1, x13, #32 + lsr x1, x1, #32 + adds x15, x1, x8 + adc x1, xzr, xzr + subs x7, x7, x13 + sbcs x13, x16, x15 + lsl x3, x7, #32 + umulh x16, x2, x5 + sbcs x15, x14, x1 + add x7, x3, x7 + sbcs x3, x17, xzr + lsr x1, x7, #32 + sbcs x14, x11, xzr + sbc x11, x8, xzr + subs x8, x1, x7 + sbc x1, x7, xzr + extr x8, x1, x8, #32 + lsr x1, x1, #32 + adds x1, x1, x7 + adc x17, xzr, xzr + subs x13, x13, x8 + umulh x8, x9, x6 + sbcs x1, x15, x1 + sbcs x15, x3, x17 + sbcs x3, x14, xzr + mul x17, x2, x5 + sbcs x11, x11, xzr + stp x13, x1, [sp, #288] + sbc x14, x7, xzr + mul x7, x4, x10 + subs x1, x9, x2 + stp x15, x3, [sp, #304] + csetm x15, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + stp x11, x14, [sp, #320] + mul x14, x9, x6 + adds x17, x8, x17 + adcs x7, x16, x7 + adc x13, x12, xzr + subs x12, x5, x6 + cneg x3, x12, cc // cc = lo, ul, last + cinv x16, x15, cc // cc = lo, ul, last + mul x8, x1, x3 + umulh x1, x1, x3 + eor x12, x8, x16 + adds x11, x17, x14 + adcs x3, x7, x17 + adcs x15, x13, x7 + adc x8, x13, xzr + adds x3, x3, x14 + adcs x15, x15, x17 + adcs x17, x8, x7 + eor x1, x1, x16 + adc x13, x13, xzr + subs x9, x9, x4 + csetm x8, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x4, x2, x4 + cneg x4, x4, cc // cc = lo, ul, last + csetm x7, cc // cc = lo, ul, last + subs x2, x10, x6 + cinv x8, x8, cc // cc = lo, ul, last + cneg x2, x2, cc // cc = lo, ul, last + cmn x16, #0x1 + adcs x11, x11, x12 + mul x12, x9, x2 + adcs x3, x3, x1 + adcs x15, x15, x16 + umulh x9, x9, x2 + adcs x17, x17, x16 + adc x13, x13, x16 + subs x1, x10, x5 + cinv x2, x7, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + eor x9, x9, x8 + cmn x8, #0x1 + eor x7, x12, x8 + mul x12, x4, x1 + adcs x3, x3, x7 + adcs x7, x15, x9 + adcs x15, x17, x8 + ldp x9, x17, [sp, #304] + umulh x4, x4, x1 + adc x8, x13, x8 + cmn x2, #0x1 + eor x1, x12, x2 + adcs x1, x7, x1 + ldp x7, x16, [sp, #288] + eor x12, x4, x2 + adcs x4, x15, x12 + ldp x15, x12, [sp, #320] + adc x8, x8, x2 + adds x13, x14, x14 + umulh x14, x5, x10 + adcs x2, x11, x11 + adcs x3, x3, x3 + adcs x1, x1, x1 + adcs x4, x4, x4 + adcs x11, x8, x8 + adc x8, xzr, xzr + adds x13, x13, x7 + adcs x2, x2, x16 + mul x16, x5, x10 + adcs x3, x3, x9 + adcs x1, x1, x17 + umulh x5, x5, x5 + lsl x9, x13, #32 + add x9, x9, x13 + adcs x4, x4, x15 + mov x13, v28.d[1] + adcs x15, x11, x12 + lsr x7, x9, #32 + adc x11, x8, xzr + subs x7, x7, x9 + umulh x10, x10, x10 + sbc x17, x9, xzr + extr x7, x17, x7, #32 + lsr x17, x17, #32 + adds x17, x17, x9 + adc x12, xzr, xzr + subs x8, x2, x7 + sbcs x17, x3, x17 + lsl x7, x8, #32 + sbcs x2, x1, x12 + add x3, x7, x8 + sbcs x12, x4, xzr + lsr x1, x3, #32 + sbcs x7, x15, xzr + sbc x15, x9, xzr + subs x1, x1, x3 + sbc x4, x3, xzr + lsr x9, x4, #32 + extr x8, x4, x1, #32 + adds x9, x9, x3 + adc x4, xzr, xzr + subs x1, x17, x8 + lsl x17, x1, #32 + sbcs x8, x2, x9 + sbcs x9, x12, x4 + add x17, x17, x1 + mov x1, v18.d[1] + lsr x2, x17, #32 + sbcs x7, x7, xzr + mov x12, v18.d[0] + sbcs x15, x15, xzr + sbc x3, x3, xzr + subs x4, x2, x17 + sbc x2, x17, xzr + adds x12, x13, x12 + adcs x16, x16, x1 + 
lsr x13, x2, #32 + extr x1, x2, x4, #32 + adc x2, x14, xzr + adds x4, x13, x17 + mul x13, x6, x6 + adc x14, xzr, xzr + subs x1, x8, x1 + sbcs x4, x9, x4 + mov x9, v28.d[0] + sbcs x7, x7, x14 + sbcs x8, x15, xzr + sbcs x3, x3, xzr + sbc x14, x17, xzr + adds x17, x9, x9 + adcs x12, x12, x12 + mov x15, v19.d[0] + adcs x9, x16, x16 + umulh x6, x6, x6 + adcs x16, x2, x2 + adc x2, xzr, xzr + adds x11, x11, x8 + adcs x3, x3, xzr + adcs x14, x14, xzr + adcs x8, xzr, xzr + adds x13, x1, x13 + mov x1, v19.d[1] + adcs x6, x4, x6 + mov x4, #0xffffffff // #4294967295 + adcs x15, x7, x15 + adcs x7, x11, x5 + adcs x1, x3, x1 + adcs x14, x14, x10 + adc x11, x8, xzr + adds x6, x6, x17 + adcs x8, x15, x12 + adcs x3, x7, x9 + adcs x15, x1, x16 + mov x16, #0xffffffff00000001 // #-4294967295 + adcs x14, x14, x2 + mov x2, #0x1 // #1 + adc x17, x11, xzr + cmn x13, x16 + adcs xzr, x6, x4 + adcs xzr, x8, x2 + adcs xzr, x3, xzr + adcs xzr, x15, xzr + adcs xzr, x14, xzr + adc x1, x17, xzr + neg x9, x1 + and x1, x16, x9 + adds x11, x13, x1 + and x13, x4, x9 + adcs x5, x6, x13 + and x1, x2, x9 + adcs x7, x8, x1 + stp x11, x5, [sp, #288] + adcs x11, x3, xzr + adcs x2, x15, xzr + stp x7, x11, [sp, #304] + adc x17, x14, xzr + stp x2, x17, [sp, #320] + ldr q3, [x26] + ldr q25, [sp, #48] + ldp x13, x23, [sp, #48] + ldp x3, x21, [x26] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [sp, #80] + ldp x8, x24, [x26, #16] + subs x6, x3, x21 + ldr q0, [x26, #32] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [sp, #64] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [sp, #80] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [x26, #32] + lsr x1, x2, #32 + eor 
x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x26, x4, x16 + mov x4, v27.d[0] + sbcs x27, x20, x11 + sbcs x20, x9, x12 + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #160] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #176] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #160] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #176] + adds x20, x22, x26 + mul x10, x13, x14 + adcs x11, x11, x27 + eor x9, x8, x21 + adcs x26, x19, x17 + stp x20, x11, [sp, #144] + adcs x27, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #176] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg 
x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #144] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #176] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x26 + eor x1, x22, x9 + adcs x24, x23, x27 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x26 + adcs x15, x17, x27 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x14, x24, x11 + stp x22, x5, [sp, #144] + adcs x5, x13, x23 + adcs x21, x8, x23 + stp x14, x5, [sp, #160] + adc x12, x15, x23 + stp x21, x12, [sp, #176] + ldr q1, [sp, #240] + ldp x9, x2, [sp, #240] + ldr q0, [sp, #240] + ldp x4, x6, [sp, #256] + rev64 v21.4s, v1.4s + uzp2 v28.4s, v1.4s, v1.4s + umulh x7, x9, x2 + xtn v17.2s, v1.2d + mul v27.4s, v21.4s, v0.4s + ldr q20, [sp, #272] + xtn v30.2s, v0.2d + ldr q1, [sp, #272] + uzp2 v31.4s, v0.4s, v0.4s + ldp x5, x10, [sp, #272] + umulh x8, x9, x4 + uaddlp v3.2d, v27.4s + umull v16.2d, v30.2s, v17.2s + mul x16, x9, x4 + umull v27.2d, v30.2s, v28.2s + shrn v0.2s, v20.2d, #32 + xtn v7.2s, v20.2d + shl v20.2d, v3.2d, #32 + umull v3.2d, v31.2s, v28.2s + 
mul x3, x2, x4 + umlal v20.2d, v30.2s, v17.2s + umull v22.2d, v7.2s, v0.2s + usra v27.2d, v16.2d, #32 + umulh x11, x2, x4 + movi v21.2d, #0xffffffff + uzp2 v28.4s, v1.4s, v1.4s + adds x15, x16, x7 + and v5.16b, v27.16b, v21.16b + adcs x3, x3, x8 + usra v3.2d, v27.2d, #32 + dup v29.2d, x6 + adcs x16, x11, xzr + mov x14, v20.d[0] + umlal v5.2d, v31.2s, v17.2s + mul x8, x9, x2 + mov x7, v20.d[1] + shl v19.2d, v22.2d, #33 + xtn v25.2s, v29.2d + rev64 v31.4s, v1.4s + lsl x13, x14, #32 + uzp2 v6.4s, v29.4s, v29.4s + umlal v19.2d, v7.2s, v7.2s + usra v3.2d, v5.2d, #32 + adds x1, x8, x8 + umulh x8, x4, x4 + add x12, x13, x14 + mul v17.4s, v31.4s, v29.4s + xtn v4.2s, v1.2d + adcs x14, x15, x15 + lsr x13, x12, #32 + adcs x15, x3, x3 + umull v31.2d, v25.2s, v28.2s + adcs x11, x16, x16 + umull v21.2d, v25.2s, v4.2s + mov x17, v3.d[0] + umull v18.2d, v6.2s, v28.2s + adc x16, x8, xzr + uaddlp v16.2d, v17.4s + movi v1.2d, #0xffffffff + subs x13, x13, x12 + usra v31.2d, v21.2d, #32 + sbc x8, x12, xzr + adds x17, x17, x1 + mul x1, x4, x4 + shl v28.2d, v16.2d, #32 + mov x3, v3.d[1] + adcs x14, x7, x14 + extr x7, x8, x13, #32 + adcs x13, x3, x15 + and v3.16b, v31.16b, v1.16b + adcs x11, x1, x11 + lsr x1, x8, #32 + umlal v3.2d, v6.2s, v4.2s + usra v18.2d, v31.2d, #32 + adc x3, x16, xzr + adds x1, x1, x12 + umlal v28.2d, v25.2s, v4.2s + adc x16, xzr, xzr + subs x15, x17, x7 + sbcs x7, x14, x1 + lsl x1, x15, #32 + sbcs x16, x13, x16 + add x8, x1, x15 + usra v18.2d, v3.2d, #32 + sbcs x14, x11, xzr + lsr x1, x8, #32 + sbcs x17, x3, xzr + sbc x11, x12, xzr + subs x13, x1, x8 + umulh x12, x4, x10 + sbc x1, x8, xzr + extr x13, x1, x13, #32 + lsr x1, x1, #32 + adds x15, x1, x8 + adc x1, xzr, xzr + subs x7, x7, x13 + sbcs x13, x16, x15 + lsl x3, x7, #32 + umulh x16, x2, x5 + sbcs x15, x14, x1 + add x7, x3, x7 + sbcs x3, x17, xzr + lsr x1, x7, #32 + sbcs x14, x11, xzr + sbc x11, x8, xzr + subs x8, x1, x7 + sbc x1, x7, xzr + extr x8, x1, x8, #32 + lsr x1, x1, #32 + adds x1, x1, x7 + adc x17, xzr, xzr + subs x13, x13, x8 + umulh x8, x9, x6 + sbcs x1, x15, x1 + sbcs x19, x3, x17 + sbcs x20, x14, xzr + mul x17, x2, x5 + sbcs x11, x11, xzr + stp x13, x1, [sp, #192] + sbc x14, x7, xzr + mul x7, x4, x10 + subs x1, x9, x2 + csetm x15, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + stp x11, x14, [sp, #224] + mul x14, x9, x6 + adds x17, x8, x17 + adcs x7, x16, x7 + adc x13, x12, xzr + subs x12, x5, x6 + cneg x3, x12, cc // cc = lo, ul, last + cinv x16, x15, cc // cc = lo, ul, last + mul x8, x1, x3 + umulh x1, x1, x3 + eor x12, x8, x16 + adds x11, x17, x14 + adcs x3, x7, x17 + adcs x15, x13, x7 + adc x8, x13, xzr + adds x3, x3, x14 + adcs x15, x15, x17 + adcs x17, x8, x7 + eor x1, x1, x16 + adc x13, x13, xzr + subs x9, x9, x4 + csetm x8, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x4, x2, x4 + cneg x4, x4, cc // cc = lo, ul, last + csetm x7, cc // cc = lo, ul, last + subs x2, x10, x6 + cinv x8, x8, cc // cc = lo, ul, last + cneg x2, x2, cc // cc = lo, ul, last + cmn x16, #0x1 + adcs x11, x11, x12 + mul x12, x9, x2 + adcs x3, x3, x1 + adcs x15, x15, x16 + umulh x9, x9, x2 + adcs x17, x17, x16 + adc x13, x13, x16 + subs x1, x10, x5 + cinv x2, x7, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + eor x9, x9, x8 + cmn x8, #0x1 + eor x7, x12, x8 + mul x12, x4, x1 + adcs x3, x3, x7 + adcs x7, x15, x9 + adcs x15, x17, x8 + umulh x4, x4, x1 + adc x8, x13, x8 + cmn x2, #0x1 + eor x1, x12, x2 + adcs x1, x7, x1 + ldp x7, x16, [sp, #192] + eor x12, x4, x2 + adcs x4, x15, x12 + ldp 
x15, x12, [sp, #224] + adc x8, x8, x2 + adds x13, x14, x14 + umulh x14, x5, x10 + adcs x2, x11, x11 + adcs x3, x3, x3 + adcs x1, x1, x1 + adcs x4, x4, x4 + adcs x11, x8, x8 + adc x8, xzr, xzr + adds x13, x13, x7 + adcs x2, x2, x16 + mul x16, x5, x10 + adcs x3, x3, x19 + adcs x1, x1, x20 + umulh x5, x5, x5 + lsl x9, x13, #32 + add x9, x9, x13 + adcs x4, x4, x15 + mov x13, v28.d[1] + adcs x15, x11, x12 + lsr x7, x9, #32 + adc x11, x8, xzr + subs x7, x7, x9 + umulh x10, x10, x10 + sbc x17, x9, xzr + extr x7, x17, x7, #32 + lsr x17, x17, #32 + adds x17, x17, x9 + adc x12, xzr, xzr + subs x8, x2, x7 + sbcs x17, x3, x17 + lsl x7, x8, #32 + sbcs x2, x1, x12 + add x3, x7, x8 + sbcs x12, x4, xzr + lsr x1, x3, #32 + sbcs x7, x15, xzr + sbc x15, x9, xzr + subs x1, x1, x3 + sbc x4, x3, xzr + lsr x9, x4, #32 + extr x8, x4, x1, #32 + adds x9, x9, x3 + adc x4, xzr, xzr + subs x1, x17, x8 + lsl x17, x1, #32 + sbcs x8, x2, x9 + sbcs x9, x12, x4 + add x17, x17, x1 + mov x1, v18.d[1] + lsr x2, x17, #32 + sbcs x7, x7, xzr + mov x12, v18.d[0] + sbcs x15, x15, xzr + sbc x3, x3, xzr + subs x4, x2, x17 + sbc x2, x17, xzr + adds x12, x13, x12 + adcs x16, x16, x1 + lsr x13, x2, #32 + extr x1, x2, x4, #32 + adc x2, x14, xzr + adds x4, x13, x17 + mul x13, x6, x6 + adc x14, xzr, xzr + subs x1, x8, x1 + sbcs x4, x9, x4 + mov x9, v28.d[0] + sbcs x7, x7, x14 + sbcs x8, x15, xzr + sbcs x3, x3, xzr + sbc x14, x17, xzr + adds x17, x9, x9 + adcs x12, x12, x12 + mov x15, v19.d[0] + adcs x9, x16, x16 + umulh x6, x6, x6 + adcs x16, x2, x2 + adc x2, xzr, xzr + adds x11, x11, x8 + adcs x3, x3, xzr + adcs x14, x14, xzr + adcs x8, xzr, xzr + adds x13, x1, x13 + mov x1, v19.d[1] + adcs x6, x4, x6 + mov x4, #0xffffffff // #4294967295 + adcs x15, x7, x15 + adcs x7, x11, x5 + adcs x1, x3, x1 + adcs x14, x14, x10 + adc x11, x8, xzr + adds x6, x6, x17 + adcs x8, x15, x12 + adcs x3, x7, x9 + adcs x15, x1, x16 + mov x16, #0xffffffff00000001 // #-4294967295 + adcs x14, x14, x2 + mov x2, #0x1 // #1 + adc x17, x11, xzr + cmn x13, x16 + adcs xzr, x6, x4 + adcs xzr, x8, x2 + adcs xzr, x3, xzr + adcs xzr, x15, xzr + adcs xzr, x14, xzr + adc x1, x17, xzr + neg x9, x1 + and x1, x16, x9 + adds x19, x13, x1 + and x13, x4, x9 + adcs x20, x6, x13 + and x1, x2, x9 + adcs x7, x8, x1 + adcs x11, x3, xzr + adcs x2, x15, xzr + stp x7, x11, [sp, #208] + adc x17, x14, xzr + stp x2, x17, [sp, #224] + ldp x0, x1, [sp, #288] + mov x6, #0xffffffff // #4294967295 + subs x6, x6, x0 + mov x7, #0xffffffff00000000 // #-4294967296 + sbcs x7, x7, x1 + ldp x0, x1, [sp, #304] + mov x8, #0xfffffffffffffffe // #-2 + sbcs x8, x8, x0 + mov x13, #0xffffffffffffffff // #-1 + sbcs x9, x13, x1 + ldp x0, x1, [sp, #320] + sbcs x10, x13, x0 + sbc x11, x13, x1 + mov x12, #0x9 // #9 + mul x0, x12, x6 + mul x1, x12, x7 + mul x2, x12, x8 + mul x3, x12, x9 + mul x4, x12, x10 + mul x5, x12, x11 + umulh x6, x12, x6 + umulh x7, x12, x7 + umulh x8, x12, x8 + umulh x9, x12, x9 + umulh x10, x12, x10 + umulh x12, x12, x11 + adds x1, x1, x6 + adcs x2, x2, x7 + adcs x3, x3, x8 + adcs x4, x4, x9 + adcs x5, x5, x10 + mov x6, #0x1 // #1 + adc x6, x12, x6 + ldp x8, x9, [sp, #144] + ldp x10, x11, [sp, #160] + ldp x12, x13, [sp, #176] + mov x14, #0xc // #12 + mul x15, x14, x8 + umulh x8, x14, x8 + adds x0, x0, x15 + mul x15, x14, x9 + umulh x9, x14, x9 + adcs x1, x1, x15 + mul x15, x14, x10 + umulh x10, x14, x10 + adcs x2, x2, x15 + mul x15, x14, x11 + umulh x11, x14, x11 + adcs x3, x3, x15 + mul x15, x14, x12 + umulh x12, x14, x12 + adcs x4, x4, x15 + mul x15, x14, x13 + umulh x13, x14, x13 + adcs 
x5, x5, x15 + adc x6, x6, xzr + adds x1, x1, x8 + adcs x2, x2, x9 + adcs x3, x3, x10 + adcs x4, x4, x11 + adcs x5, x5, x12 + adcs x6, x6, x13 + lsl x7, x6, #32 + subs x8, x6, x7 + sbc x7, x7, xzr + adds x0, x0, x8 + adcs x1, x1, x7 + adcs x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + adcs x5, x5, xzr + csetm x6, cc // cc = lo, ul, last + mov x7, #0xffffffff // #4294967295 + and x7, x7, x6 + adds x0, x0, x7 + eor x7, x7, x6 + adcs x1, x1, x7 + mov x7, #0xfffffffffffffffe // #-2 + and x7, x7, x6 + adcs x2, x2, x7 + adcs x3, x3, x6 + adcs x4, x4, x6 + adc x5, x5, x6 + stp x0, x1, [sp, #288] + stp x2, x3, [sp, #304] + stp x4, x5, [sp, #320] + mov x2, sp + ldp x4, x3, [x2] + subs x5, x19, x4 + sbcs x6, x20, x3 + ldp x7, x8, [sp, #208] + ldp x4, x3, [x2, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + ldp x9, x10, [sp, #224] + ldp x4, x3, [x2, #32] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + csetm x3, cc // cc = lo, ul, last + mov x4, #0xffffffff // #4294967295 + and x4, x4, x3 + adds x5, x5, x4 + eor x4, x4, x3 + adcs x6, x6, x4 + mov x4, #0xfffffffffffffffe // #-2 + and x4, x4, x3 + adcs x7, x7, x4 + adcs x8, x8, x3 + adcs x9, x9, x3 + adc x10, x10, x3 + stp x5, x6, [sp, #240] + stp x7, x8, [sp, #256] + stp x9, x10, [sp, #272] + ldr q1, [sp, #48] + ldp x9, x2, [sp, #48] + ldr q0, [sp, #48] + ldp x4, x6, [sp, #64] + rev64 v21.4s, v1.4s + uzp2 v28.4s, v1.4s, v1.4s + umulh x7, x9, x2 + xtn v17.2s, v1.2d + mul v27.4s, v21.4s, v0.4s + ldr q20, [sp, #80] + xtn v30.2s, v0.2d + ldr q1, [sp, #80] + uzp2 v31.4s, v0.4s, v0.4s + ldp x5, x10, [sp, #80] + umulh x8, x9, x4 + uaddlp v3.2d, v27.4s + umull v16.2d, v30.2s, v17.2s + mul x16, x9, x4 + umull v27.2d, v30.2s, v28.2s + shrn v0.2s, v20.2d, #32 + xtn v7.2s, v20.2d + shl v20.2d, v3.2d, #32 + umull v3.2d, v31.2s, v28.2s + mul x3, x2, x4 + umlal v20.2d, v30.2s, v17.2s + umull v22.2d, v7.2s, v0.2s + usra v27.2d, v16.2d, #32 + umulh x11, x2, x4 + movi v21.2d, #0xffffffff + uzp2 v28.4s, v1.4s, v1.4s + adds x15, x16, x7 + and v5.16b, v27.16b, v21.16b + adcs x3, x3, x8 + usra v3.2d, v27.2d, #32 + dup v29.2d, x6 + adcs x16, x11, xzr + mov x14, v20.d[0] + umlal v5.2d, v31.2s, v17.2s + mul x8, x9, x2 + mov x7, v20.d[1] + shl v19.2d, v22.2d, #33 + xtn v25.2s, v29.2d + rev64 v31.4s, v1.4s + lsl x13, x14, #32 + uzp2 v6.4s, v29.4s, v29.4s + umlal v19.2d, v7.2s, v7.2s + usra v3.2d, v5.2d, #32 + adds x1, x8, x8 + umulh x8, x4, x4 + add x12, x13, x14 + mul v17.4s, v31.4s, v29.4s + xtn v4.2s, v1.2d + adcs x14, x15, x15 + lsr x13, x12, #32 + adcs x15, x3, x3 + umull v31.2d, v25.2s, v28.2s + adcs x11, x16, x16 + umull v21.2d, v25.2s, v4.2s + mov x17, v3.d[0] + umull v18.2d, v6.2s, v28.2s + adc x16, x8, xzr + uaddlp v16.2d, v17.4s + movi v1.2d, #0xffffffff + subs x13, x13, x12 + usra v31.2d, v21.2d, #32 + sbc x8, x12, xzr + adds x17, x17, x1 + mul x1, x4, x4 + shl v28.2d, v16.2d, #32 + mov x3, v3.d[1] + adcs x14, x7, x14 + extr x7, x8, x13, #32 + adcs x13, x3, x15 + and v3.16b, v31.16b, v1.16b + adcs x11, x1, x11 + lsr x1, x8, #32 + umlal v3.2d, v6.2s, v4.2s + usra v18.2d, v31.2d, #32 + adc x3, x16, xzr + adds x1, x1, x12 + umlal v28.2d, v25.2s, v4.2s + adc x16, xzr, xzr + subs x15, x17, x7 + sbcs x7, x14, x1 + lsl x1, x15, #32 + sbcs x16, x13, x16 + add x8, x1, x15 + usra v18.2d, v3.2d, #32 + sbcs x14, x11, xzr + lsr x1, x8, #32 + sbcs x17, x3, xzr + sbc x11, x12, xzr + subs x13, x1, x8 + umulh x12, x4, x10 + sbc x1, x8, xzr + extr x13, x1, x13, #32 + lsr x1, x1, #32 + adds x15, x1, x8 + adc x1, xzr, xzr + subs x7, x7, x13 + sbcs x13, x16, x15 + lsl x3, x7, #32 + umulh x16, x2, 
x5 + sbcs x15, x14, x1 + add x7, x3, x7 + sbcs x3, x17, xzr + lsr x1, x7, #32 + sbcs x14, x11, xzr + sbc x11, x8, xzr + subs x8, x1, x7 + sbc x1, x7, xzr + extr x8, x1, x8, #32 + lsr x1, x1, #32 + adds x1, x1, x7 + adc x17, xzr, xzr + subs x13, x13, x8 + umulh x8, x9, x6 + sbcs x1, x15, x1 + sbcs x19, x3, x17 + sbcs x20, x14, xzr + mul x17, x2, x5 + sbcs x11, x11, xzr + stp x13, x1, [sp, #192] + sbc x14, x7, xzr + mul x7, x4, x10 + subs x1, x9, x2 + csetm x15, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + stp x11, x14, [sp, #224] + mul x14, x9, x6 + adds x17, x8, x17 + adcs x7, x16, x7 + adc x13, x12, xzr + subs x12, x5, x6 + cneg x3, x12, cc // cc = lo, ul, last + cinv x16, x15, cc // cc = lo, ul, last + mul x8, x1, x3 + umulh x1, x1, x3 + eor x12, x8, x16 + adds x11, x17, x14 + adcs x3, x7, x17 + adcs x15, x13, x7 + adc x8, x13, xzr + adds x3, x3, x14 + adcs x15, x15, x17 + adcs x17, x8, x7 + eor x1, x1, x16 + adc x13, x13, xzr + subs x9, x9, x4 + csetm x8, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x4, x2, x4 + cneg x4, x4, cc // cc = lo, ul, last + csetm x7, cc // cc = lo, ul, last + subs x2, x10, x6 + cinv x8, x8, cc // cc = lo, ul, last + cneg x2, x2, cc // cc = lo, ul, last + cmn x16, #0x1 + adcs x11, x11, x12 + mul x12, x9, x2 + adcs x3, x3, x1 + adcs x15, x15, x16 + umulh x9, x9, x2 + adcs x17, x17, x16 + adc x13, x13, x16 + subs x1, x10, x5 + cinv x2, x7, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + eor x9, x9, x8 + cmn x8, #0x1 + eor x7, x12, x8 + mul x12, x4, x1 + adcs x3, x3, x7 + adcs x7, x15, x9 + adcs x15, x17, x8 + umulh x4, x4, x1 + adc x8, x13, x8 + cmn x2, #0x1 + eor x1, x12, x2 + adcs x1, x7, x1 + ldp x7, x16, [sp, #192] + eor x12, x4, x2 + adcs x4, x15, x12 + ldp x15, x12, [sp, #224] + adc x8, x8, x2 + adds x13, x14, x14 + umulh x14, x5, x10 + adcs x2, x11, x11 + adcs x3, x3, x3 + adcs x1, x1, x1 + adcs x4, x4, x4 + adcs x11, x8, x8 + adc x8, xzr, xzr + adds x13, x13, x7 + adcs x2, x2, x16 + mul x16, x5, x10 + adcs x3, x3, x19 + adcs x1, x1, x20 + umulh x5, x5, x5 + lsl x9, x13, #32 + add x9, x9, x13 + adcs x4, x4, x15 + mov x13, v28.d[1] + adcs x15, x11, x12 + lsr x7, x9, #32 + adc x11, x8, xzr + subs x7, x7, x9 + umulh x10, x10, x10 + sbc x17, x9, xzr + extr x7, x17, x7, #32 + lsr x17, x17, #32 + adds x17, x17, x9 + adc x12, xzr, xzr + subs x8, x2, x7 + sbcs x17, x3, x17 + lsl x7, x8, #32 + sbcs x2, x1, x12 + add x3, x7, x8 + sbcs x12, x4, xzr + lsr x1, x3, #32 + sbcs x7, x15, xzr + sbc x15, x9, xzr + subs x1, x1, x3 + sbc x4, x3, xzr + lsr x9, x4, #32 + extr x8, x4, x1, #32 + adds x9, x9, x3 + adc x4, xzr, xzr + subs x1, x17, x8 + lsl x17, x1, #32 + sbcs x8, x2, x9 + sbcs x9, x12, x4 + add x17, x17, x1 + mov x1, v18.d[1] + lsr x2, x17, #32 + sbcs x7, x7, xzr + mov x12, v18.d[0] + sbcs x15, x15, xzr + sbc x3, x3, xzr + subs x4, x2, x17 + sbc x2, x17, xzr + adds x12, x13, x12 + adcs x16, x16, x1 + lsr x13, x2, #32 + extr x1, x2, x4, #32 + adc x2, x14, xzr + adds x4, x13, x17 + mul x13, x6, x6 + adc x14, xzr, xzr + subs x1, x8, x1 + sbcs x4, x9, x4 + mov x9, v28.d[0] + sbcs x7, x7, x14 + sbcs x8, x15, xzr + sbcs x3, x3, xzr + sbc x14, x17, xzr + adds x17, x9, x9 + adcs x12, x12, x12 + mov x15, v19.d[0] + adcs x9, x16, x16 + umulh x6, x6, x6 + adcs x16, x2, x2 + adc x2, xzr, xzr + adds x11, x11, x8 + adcs x3, x3, xzr + adcs x14, x14, xzr + adcs x8, xzr, xzr + adds x13, x1, x13 + mov x1, v19.d[1] + adcs x6, x4, x6 + mov x4, #0xffffffff // #4294967295 + adcs x15, x7, x15 + adcs x7, x11, x5 + adcs x1, x3, 
x1 + adcs x14, x14, x10 + adc x11, x8, xzr + adds x6, x6, x17 + adcs x8, x15, x12 + adcs x3, x7, x9 + adcs x15, x1, x16 + mov x16, #0xffffffff00000001 // #-4294967295 + adcs x14, x14, x2 + mov x2, #0x1 // #1 + adc x17, x11, xzr + cmn x13, x16 + adcs xzr, x6, x4 + adcs xzr, x8, x2 + adcs xzr, x3, xzr + adcs xzr, x15, xzr + adcs xzr, x14, xzr + adc x1, x17, xzr + neg x9, x1 + and x1, x16, x9 + adds x11, x13, x1 + and x13, x4, x9 + adcs x5, x6, x13 + and x1, x2, x9 + adcs x7, x8, x1 + stp x11, x5, [sp, #192] + adcs x11, x3, xzr + adcs x2, x15, xzr + stp x7, x11, [sp, #208] + adc x17, x14, xzr + stp x2, x17, [sp, #224] + ldp x5, x6, [sp, #240] + ldp x4, x3, [sp, #48] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #256] + ldp x4, x3, [sp, #64] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + ldp x9, x10, [sp, #272] + ldp x4, x3, [sp, #80] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + csetm x3, cc // cc = lo, ul, last + mov x4, #0xffffffff // #4294967295 + and x4, x4, x3 + adds x5, x5, x4 + eor x4, x4, x3 + adcs x6, x6, x4 + mov x4, #0xfffffffffffffffe // #-2 + and x4, x4, x3 + adcs x7, x7, x4 + adcs x8, x8, x3 + adcs x9, x9, x3 + adc x10, x10, x3 + stp x5, x6, [x25, #96] + stp x7, x8, [x25, #112] + stp x9, x10, [x25, #128] + ldr q3, [sp, #288] + ldr q25, [sp, #96] + ldp x13, x23, [sp, #96] + ldp x3, x21, [sp, #288] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [sp, #128] + ldp x8, x24, [sp, #304] + subs x6, x3, x21 + ldr q0, [sp, #320] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [sp, #112] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [sp, #128] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [sp, #320] + lsr x1, x2, #32 + 
eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x26, x4, x16 + mov x4, v27.d[0] + sbcs x27, x20, x11 + sbcs x20, x9, x12 + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #256] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #272] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #256] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #272] + adds x20, x22, x26 + mul x10, x13, x14 + adcs x11, x11, x27 + eor x9, x8, x21 + adcs x26, x19, x17 + stp x20, x11, [sp, #240] + adcs x27, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #272] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + 
cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #240] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #272] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x26 + eor x1, x22, x9 + adcs x24, x23, x27 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x26 + adcs x15, x17, x27 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x14, x24, x11 + stp x22, x5, [sp, #240] + adcs x5, x13, x23 + adcs x12, x8, x23 + stp x14, x5, [sp, #256] + adc x19, x15, x23 + ldp x1, x2, [sp, #144] + ldp x3, x4, [sp, #160] + ldp x5, x6, [sp, #176] + lsl x0, x1, #2 + ldp x7, x8, [sp, #288] + subs x0, x0, x7 + extr x1, x2, x1, #62 + sbcs x1, x1, x8 + ldp x7, x8, [sp, #304] + extr x2, x3, x2, #62 + sbcs x2, x2, x7 + extr x3, x4, x3, #62 + sbcs x3, x3, x8 + extr x4, x5, x4, #62 + ldp x7, x8, [sp, #320] + sbcs x4, x4, x7 + extr x5, x6, x5, #62 + sbcs x5, x5, x8 + lsr x6, x6, #62 + adc x6, x6, xzr + lsl x7, x6, #32 + subs x8, x6, x7 + sbc x7, x7, xzr + adds x0, x0, x8 + adcs x1, x1, x7 + adcs x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr 
+ adcs x5, x5, xzr + csetm x8, cc // cc = lo, ul, last + mov x9, #0xffffffff // #4294967295 + and x9, x9, x8 + adds x0, x0, x9 + eor x9, x9, x8 + adcs x1, x1, x9 + mov x9, #0xfffffffffffffffe // #-2 + and x9, x9, x8 + adcs x2, x2, x9 + adcs x3, x3, x8 + adcs x4, x4, x8 + adc x5, x5, x8 + stp x0, x1, [x25] + stp x2, x3, [x25, #16] + stp x4, x5, [x25, #32] + ldp x0, x1, [sp, #192] + mov x6, #0xffffffff // #4294967295 + subs x6, x6, x0 + mov x7, #0xffffffff00000000 // #-4294967296 + sbcs x7, x7, x1 + ldp x0, x1, [sp, #208] + mov x8, #0xfffffffffffffffe // #-2 + sbcs x8, x8, x0 + mov x13, #0xffffffffffffffff // #-1 + sbcs x9, x13, x1 + ldp x0, x1, [sp, #224] + sbcs x10, x13, x0 + sbc x11, x13, x1 + lsl x0, x6, #3 + extr x1, x7, x6, #61 + extr x2, x8, x7, #61 + extr x3, x9, x8, #61 + extr x4, x10, x9, #61 + extr x5, x11, x10, #61 + lsr x6, x11, #61 + add x6, x6, #0x1 + ldp x8, x9, [sp, #240] + ldp x10, x11, [sp, #256] + mov x14, #0x3 // #3 + mul x15, x14, x8 + umulh x8, x14, x8 + adds x0, x0, x15 + mul x15, x14, x9 + umulh x9, x14, x9 + adcs x1, x1, x15 + mul x15, x14, x10 + umulh x10, x14, x10 + adcs x2, x2, x15 + mul x15, x14, x11 + umulh x11, x14, x11 + adcs x3, x3, x15 + mul x15, x14, x12 + umulh x12, x14, x12 + adcs x4, x4, x15 + mul x15, x14, x19 + umulh x13, x14, x19 + adcs x5, x5, x15 + adc x6, x6, xzr + adds x1, x1, x8 + adcs x2, x2, x9 + adcs x3, x3, x10 + adcs x4, x4, x11 + adcs x5, x5, x12 + adcs x6, x6, x13 + lsl x7, x6, #32 + subs x8, x6, x7 + sbc x7, x7, xzr + adds x0, x0, x8 + adcs x1, x1, x7 + adcs x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + adcs x5, x5, xzr + csetm x6, cc // cc = lo, ul, last + mov x7, #0xffffffff // #4294967295 + and x7, x7, x6 + adds x0, x0, x7 + eor x7, x7, x6 + adcs x1, x1, x7 + mov x7, #0xfffffffffffffffe // #-2 + and x7, x7, x6 + adcs x2, x2, x7 + adcs x3, x3, x6 + adcs x4, x4, x6 + adc x5, x5, x6 + stp x0, x1, [x25, #48] + stp x2, x3, [x25, #64] + stp x4, x5, [x25, #80] + ldp x19, x20, [sp, #336] + ldp x21, x22, [sp, #352] + ldp x23, x24, [sp, #368] + ldp x25, x26, [sp, #384] + ldp x27, xzr, [sp, #400] + add sp, sp, #0x1a0 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p384/p384_montjscalarmul_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjscalarmul_alt.S similarity index 99% rename from third_party/s2n-bignum/arm/p384/p384_montjscalarmul_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjscalarmul_alt.S index 9f47090a8c0..5dfba9c862c 100644 --- a/third_party/s2n-bignum/arm/p384/p384_montjscalarmul_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjscalarmul_alt.S @@ -61,42 +61,42 @@ // which doesn't accept repetitions, assembler macros etc. 
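Note on the macro hunks that follow here and in the other renamed p384/p521 assembly files further down: they are a mechanical rewrite in which the ';' separators inside multi-statement assembler macros are replaced by an '__LF' token. The working assumption in this note is that '__LF' is defined elsewhere (presumably in the _internal_s2n_bignum.h header these files already include, whose definition is not part of this diff) so that a later preprocessing pass can turn it back into real line breaks, keeping the macros usable with toolchains that do not accept ';'-separated statements. A toy, purely illustrative sketch of that expansion:

# Toy illustration only; the real definition and expansion of __LF are not shown in this diff.
macro_body = "cmp bf, #(1*1) __LF ldp x20, x21, [x19] __LF csel x0, x20, x0, eq"
print(macro_body.replace(" __LF ", "\n"))   # one assembler statement per line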
#define selectblock(I) \ - cmp bf, #(1*I); \ - ldp x20, x21, [x19]; \ - csel x0, x20, x0, eq; \ - csel x1, x21, x1, eq; \ - ldp x20, x21, [x19, #16]; \ - csel x2, x20, x2, eq; \ - csel x3, x21, x3, eq; \ - ldp x20, x21, [x19, #32]; \ - csel x4, x20, x4, eq; \ - csel x5, x21, x5, eq; \ - ldp x20, x21, [x19, #48]; \ - csel x6, x20, x6, eq; \ - csel x7, x21, x7, eq; \ - ldp x20, x21, [x19, #64]; \ - csel x8, x20, x8, eq; \ - csel x9, x21, x9, eq; \ - ldp x20, x21, [x19, #80]; \ - csel x10, x20, x10, eq; \ - csel x11, x21, x11, eq; \ - ldp x20, x21, [x19, #96]; \ - csel x12, x20, x12, eq; \ - csel x13, x21, x13, eq; \ - ldp x20, x21, [x19, #112]; \ - csel x14, x20, x14, eq; \ - csel x15, x21, x15, eq; \ - ldp x20, x21, [x19, #128]; \ - csel x16, x20, x16, eq; \ - csel x17, x21, x17, eq; \ + cmp bf, #(1*I) __LF \ + ldp x20, x21, [x19] __LF \ + csel x0, x20, x0, eq __LF \ + csel x1, x21, x1, eq __LF \ + ldp x20, x21, [x19, #16] __LF \ + csel x2, x20, x2, eq __LF \ + csel x3, x21, x3, eq __LF \ + ldp x20, x21, [x19, #32] __LF \ + csel x4, x20, x4, eq __LF \ + csel x5, x21, x5, eq __LF \ + ldp x20, x21, [x19, #48] __LF \ + csel x6, x20, x6, eq __LF \ + csel x7, x21, x7, eq __LF \ + ldp x20, x21, [x19, #64] __LF \ + csel x8, x20, x8, eq __LF \ + csel x9, x21, x9, eq __LF \ + ldp x20, x21, [x19, #80] __LF \ + csel x10, x20, x10, eq __LF \ + csel x11, x21, x11, eq __LF \ + ldp x20, x21, [x19, #96] __LF \ + csel x12, x20, x12, eq __LF \ + csel x13, x21, x13, eq __LF \ + ldp x20, x21, [x19, #112] __LF \ + csel x14, x20, x14, eq __LF \ + csel x15, x21, x15, eq __LF \ + ldp x20, x21, [x19, #128] __LF \ + csel x16, x20, x16, eq __LF \ + csel x17, x21, x17, eq __LF \ add x19, x19, #JACSIZE // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 S2N_BN_SYMBOL(p384_montjscalarmul_alt): diff --git a/third_party/s2n-bignum/arm/p384/bignum_montmul_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/unopt/bignum_montmul_p384_base.S similarity index 87% rename from third_party/s2n-bignum/arm/p384/bignum_montmul_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/unopt/bignum_montmul_p384_base.S index 05c3d1786a8..cda6a1571b0 100644 --- a/third_party/s2n-bignum/arm/p384/bignum_montmul_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/unopt/bignum_montmul_p384_base.S @@ -5,7 +5,7 @@ // Montgomery multiply, z := (x * y / 2^384) mod p_384 // Inputs x[6], y[6]; output z[6] // -// extern void bignum_montmul_p384 +// extern void bignum_montmul_p384_base // (uint64_t z[static 6], uint64_t x[static 6], uint64_t y[static 6]); // // Does z := (2^{-384} * x * y) mod p_384, assuming that the inputs x and y @@ -16,8 +16,8 @@ // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p384) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p384) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p384_base) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p384_base) .text .balign 4 @@ -28,15 +28,15 @@ // --------------------------------------------------------------------------- #define muldiffn(c,h,l, t, x,y, w,z) \ - subs t, x, y; \ - cneg t, t, cc; \ - csetm c, cc; \ - subs h, w, z; \ - cneg h, h, cc; \ - mul l, t, h; \ - umulh h, t, h; \ - cinv c, c, cc; \ - eor l, l, c; \ + subs t, x, y __LF \ + cneg t, t, cc __LF \ + 
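In the selectblock(I) hunk just above, each invocation compares the digit bf (presumably derived from the scalar) against the constant I, conditionally keeps one 18-word table row in x0..x17 via csel, and advances x19 by JACSIZE to the next row; the apparent point of the cmp/csel pattern is that the table walk touches every row, so the lookup does not leak the index through branches or addresses. A functional model follows, with the caveat that Python cannot express that constant-time property:

# Functional sketch only (assumptions: bf is a 1-based index into the table and
# the accumulator registers start out zeroed or holding a prior default value).
def select_row(table, bf):
    acc = [0] * 18                                            # models x0..x17
    for i, row in enumerate(table, start=1):
        acc = [r if bf == i else a for a, r in zip(acc, row)] # csel ..., eq
    return acc

table = [[100 * i + j for j in range(18)] for i in range(1, 17)]
assert select_row(table, 5) == table[4]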
csetm c, cc __LF \ + subs h, w, z __LF \ + cneg h, h, cc __LF \ + mul l, t, h __LF \ + umulh h, t, h __LF \ + cinv c, c, cc __LF \ + eor l, l, c __LF \ eor h, h, c // --------------------------------------------------------------------------- @@ -52,27 +52,27 @@ #define montreds(d6,d5,d4,d3,d2,d1,d0, t3,t2,t1) \ /* Our correction multiplier is w = [d0 + (d0<<32)] mod 2^64 */ \ /* Recycle d0 (which we know gets implicitly cancelled) to store it */ \ - lsl t1, d0, #32; \ - add d0, t1, d0; \ + lsl t1, d0, #32 __LF \ + add d0, t1, d0 __LF \ /* Now let [t2;t1] = 2^64 * w - w + w_hi where w_hi = floor(w/2^32) */ \ /* We need to subtract 2^32 * this, and we can ignore its lower 32 */ \ /* bits since by design it will cancel anyway; we only need the w_hi */ \ /* part to get the carry propagation going. */ \ - lsr t1, d0, #32; \ - subs t1, t1, d0; \ - sbc t2, d0, xzr; \ + lsr t1, d0, #32 __LF \ + subs t1, t1, d0 __LF \ + sbc t2, d0, xzr __LF \ /* Now select in t1 the field to subtract from d1 */ \ - extr t1, t2, t1, #32; \ + extr t1, t2, t1, #32 __LF \ /* And now get the terms to subtract from d2 and d3 */ \ - lsr t2, t2, #32; \ - adds t2, t2, d0; \ - adc t3, xzr, xzr; \ + lsr t2, t2, #32 __LF \ + adds t2, t2, d0 __LF \ + adc t3, xzr, xzr __LF \ /* Do the subtraction of that portion */ \ - subs d1, d1, t1; \ - sbcs d2, d2, t2; \ - sbcs d3, d3, t3; \ - sbcs d4, d4, xzr; \ - sbcs d5, d5, xzr; \ + subs d1, d1, t1 __LF \ + sbcs d2, d2, t2 __LF \ + sbcs d3, d3, t3 __LF \ + sbcs d4, d4, xzr __LF \ + sbcs d5, d5, xzr __LF \ /* Now effectively add 2^384 * w by taking d0 as the input for last sbc */ \ sbc d6, d0, xzr @@ -102,7 +102,7 @@ #define t3 x23 #define t4 x24 -S2N_BN_SYMBOL(bignum_montmul_p384): +S2N_BN_SYMBOL(bignum_montmul_p384_base): // Save some registers diff --git a/third_party/s2n-bignum/arm/p384/bignum_montsqr_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/unopt/bignum_montsqr_p384_base.S similarity index 84% rename from third_party/s2n-bignum/arm/p384/bignum_montsqr_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/unopt/bignum_montsqr_p384_base.S index fd55c1bf029..410ae8f4a89 100644 --- a/third_party/s2n-bignum/arm/p384/bignum_montsqr_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/unopt/bignum_montsqr_p384_base.S @@ -5,7 +5,7 @@ // Montgomery square, z := (x^2 / 2^384) mod p_384 // Input x[6]; output z[6] // -// extern void bignum_montsqr_p384 +// extern void bignum_montsqr_p384_base // (uint64_t z[static 6], uint64_t x[static 6]); // // Does z := (x^2 / 2^384) mod p_384, assuming x^2 <= 2^384 * p_384, which is @@ -15,8 +15,8 @@ // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p384) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p384) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p384_base) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p384_base) .text .balign 4 @@ -27,15 +27,15 @@ // --------------------------------------------------------------------------- #define muldiffn(c,h,l, t, x,y, w,z) \ - subs t, x, y; \ - cneg t, t, cc; \ - csetm c, cc; \ - subs h, w, z; \ - cneg h, h, cc; \ - mul l, t, h; \ - umulh h, t, h; \ - cinv c, c, cc; \ - eor l, l, c; \ + subs t, x, y __LF \ + cneg t, t, cc __LF \ + csetm c, cc __LF \ + subs h, w, z __LF \ + cneg h, h, cc __LF \ + mul l, t, h __LF \ + umulh h, t, h __LF \ + cinv c, c, cc __LF \ + eor l, l, c __LF \ eor h, h, c // 
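The montreds macro above (and its identical copy in bignum_montsqr_p384_base.S just below) reduces one 64-bit word per application. The comment's claim that w = [d0 + (d0 << 32)] mod 2^64 is the right correction multiplier follows from the shape of p_384: its lowest limb is 2^32 - 1 and -p_384^{-1} mod 2^64 = 2^32 + 1, so adding w * p_384 clears the bottom word, which is exactly what one step of Montgomery reduction needs. A quick numeric check of that reasoning (an illustration, not code from this patch):

p_384 = 2**384 - 2**128 - 2**96 + 2**32 - 1
assert (-pow(p_384, -1, 2**64)) % 2**64 == 2**32 + 1         # needs Python 3.8+
for d0 in (1, 0xdeadbeefcafef00d, 2**64 - 1):
    w = (d0 + (d0 << 32)) % 2**64                            # the macro's lsl/add pair
    assert (d0 + w * p_384) % 2**64 == 0                     # bottom word cancels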
--------------------------------------------------------------------------- @@ -51,27 +51,27 @@ #define montreds(d6,d5,d4,d3,d2,d1,d0, t3,t2,t1) \ /* Our correction multiplier is w = [d0 + (d0<<32)] mod 2^64 */ \ /* Recycle d0 (which we know gets implicitly cancelled) to store it */ \ - lsl t1, d0, #32; \ - add d0, t1, d0; \ + lsl t1, d0, #32 __LF \ + add d0, t1, d0 __LF \ /* Now let [t2;t1] = 2^64 * w - w + w_hi where w_hi = floor(w/2^32) */ \ /* We need to subtract 2^32 * this, and we can ignore its lower 32 */ \ /* bits since by design it will cancel anyway; we only need the w_hi */ \ /* part to get the carry propagation going. */ \ - lsr t1, d0, #32; \ - subs t1, t1, d0; \ - sbc t2, d0, xzr; \ + lsr t1, d0, #32 __LF \ + subs t1, t1, d0 __LF \ + sbc t2, d0, xzr __LF \ /* Now select in t1 the field to subtract from d1 */ \ - extr t1, t2, t1, #32; \ + extr t1, t2, t1, #32 __LF \ /* And now get the terms to subtract from d2 and d3 */ \ - lsr t2, t2, #32; \ - adds t2, t2, d0; \ - adc t3, xzr, xzr; \ + lsr t2, t2, #32 __LF \ + adds t2, t2, d0 __LF \ + adc t3, xzr, xzr __LF \ /* Do the subtraction of that portion */ \ - subs d1, d1, t1; \ - sbcs d2, d2, t2; \ - sbcs d3, d3, t3; \ - sbcs d4, d4, xzr; \ - sbcs d5, d5, xzr; \ + subs d1, d1, t1 __LF \ + sbcs d2, d2, t2 __LF \ + sbcs d3, d3, t3 __LF \ + sbcs d4, d4, xzr __LF \ + sbcs d5, d5, xzr __LF \ /* Now effectively add 2^384 * w by taking d0 as the input for last sbc */ \ sbc d6, d0, xzr @@ -93,7 +93,7 @@ #define d3 x16 #define d4 x17 -S2N_BN_SYMBOL(bignum_montsqr_p384): +S2N_BN_SYMBOL(bignum_montsqr_p384_base): // Load in all words of the input diff --git a/third_party/s2n-bignum/arm/p384/unopt/p384_montjadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/unopt/p384_montjadd.S similarity index 96% rename from third_party/s2n-bignum/arm/p384/unopt/p384_montjadd.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/unopt/p384_montjadd.S index cbd6f3cf003..7c7e1545fe1 100644 --- a/third_party/s2n-bignum/arm/p384/unopt/p384_montjadd.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/unopt/p384_montjadd.S @@ -73,7 +73,7 @@ #define NSPACE (NUMSIZE*7) -// Corresponds to bignum_montmul_p384_neon, with callee-save register spills +// Corresponds to bignum_montmul_p384, with callee-save register spills // rewritten to update sp in advance .montmul_p384: @@ -807,33 +807,33 @@ // Corresponds exactly to bignum_sub_p384 .sub_p384: - ldp x5, x6, [x1] - ldp x4, x3, [x2] - subs x5, x5, x4 - sbcs x6, x6, x3 - ldp x7, x8, [x1, #16] - ldp x4, x3, [x2, #16] - sbcs x7, x7, x4 - sbcs x8, x8, x3 - ldp x9, x10, [x1, #32] - ldp x4, x3, [x2, #32] - sbcs x9, x9, x4 - sbcs x10, x10, x3 - csetm x3, cc - mov x4, #0xffffffff - and x4, x4, x3 - adds x5, x5, x4 - eor x4, x4, x3 - adcs x6, x6, x4 - mov x4, #0xfffffffffffffffe - and x4, x4, x3 - adcs x7, x7, x4 - adcs x8, x8, x3 - adcs x9, x9, x3 - adc x10, x10, x3 - stp x5, x6, [x0] - stp x7, x8, [x0, #16] - stp x9, x10, [x0, #32] + ldp x5, x6, [x1] + ldp x4, x3, [x2] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x1, #16] + ldp x4, x3, [x2, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + ldp x9, x10, [x1, #32] + ldp x4, x3, [x2, #32] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + csetm x3, cc + mov x4, #0xffffffff + and x4, x4, x3 + adds x5, x5, x4 + eor x4, x4, x3 + adcs x6, x6, x4 + mov x4, #0xfffffffffffffffe + and x4, x4, x3 + adcs x7, x7, x4 + adcs x8, x8, x3 + adcs x9, x9, x3 + adc x10, x10, x3 + stp x5, x6, [x0] + stp x7, x8, [x0, #16] + stp x9, x10, [x0, #32] ret diff --git 
a/third_party/s2n-bignum/arm/p384/unopt/p384_montjdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/unopt/p384_montjdouble.S similarity index 62% rename from third_party/s2n-bignum/arm/p384/unopt/p384_montjdouble.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/unopt/p384_montjdouble.S index 4cdeeb86997..214fd2a6d91 100644 --- a/third_party/s2n-bignum/arm/p384/unopt/p384_montjdouble.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/unopt/p384_montjdouble.S @@ -58,7 +58,7 @@ #define NSPACE #(NUMSIZE*7) -// Corresponds exactly to bignum_montmul_p384_neon +// Corresponds exactly to bignum_montmul_p384 .montmul_p384: sub sp, sp, 48 @@ -791,76 +791,76 @@ // Corresponds exactly to bignum_sub_p384 .sub_p384: - ldp x5, x6, [x1] - ldp x4, x3, [x2] - subs x5, x5, x4 - sbcs x6, x6, x3 - ldp x7, x8, [x1, #16] - ldp x4, x3, [x2, #16] - sbcs x7, x7, x4 - sbcs x8, x8, x3 - ldp x9, x10, [x1, #32] - ldp x4, x3, [x2, #32] - sbcs x9, x9, x4 - sbcs x10, x10, x3 - csetm x3, cc - mov x4, #0xffffffff - and x4, x4, x3 - adds x5, x5, x4 - eor x4, x4, x3 - adcs x6, x6, x4 - mov x4, #0xfffffffffffffffe - and x4, x4, x3 - adcs x7, x7, x4 - adcs x8, x8, x3 - adcs x9, x9, x3 - adc x10, x10, x3 - stp x5, x6, [x0] - stp x7, x8, [x0, #16] - stp x9, x10, [x0, #32] + ldp x5, x6, [x1] + ldp x4, x3, [x2] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x1, #16] + ldp x4, x3, [x2, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + ldp x9, x10, [x1, #32] + ldp x4, x3, [x2, #32] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + csetm x3, cc + mov x4, #0xffffffff + and x4, x4, x3 + adds x5, x5, x4 + eor x4, x4, x3 + adcs x6, x6, x4 + mov x4, #0xfffffffffffffffe + and x4, x4, x3 + adcs x7, x7, x4 + adcs x8, x8, x3 + adcs x9, x9, x3 + adc x10, x10, x3 + stp x5, x6, [x0] + stp x7, x8, [x0, #16] + stp x9, x10, [x0, #32] ret // Corresponds exactly to bignum_add_p384 .add_p384: - ldp x5, x6, [x1] - ldp x4, x3, [x2] - adds x5, x5, x4 - adcs x6, x6, x3 - ldp x7, x8, [x1, #16] - ldp x4, x3, [x2, #16] - adcs x7, x7, x4 - adcs x8, x8, x3 - ldp x9, x10, [x1, #32] - ldp x4, x3, [x2, #32] - adcs x9, x9, x4 - adcs x10, x10, x3 - adc x3, xzr, xzr - mov x4, #0xffffffff - cmp x5, x4 - mov x4, #0xffffffff00000000 - sbcs xzr, x6, x4 - mov x4, #0xfffffffffffffffe - sbcs xzr, x7, x4 - adcs xzr, x8, xzr - adcs xzr, x9, xzr - adcs xzr, x10, xzr - adcs x3, x3, xzr - csetm x3, ne - mov x4, #0xffffffff - and x4, x4, x3 - subs x5, x5, x4 - eor x4, x4, x3 - sbcs x6, x6, x4 - mov x4, #0xfffffffffffffffe - and x4, x4, x3 - sbcs x7, x7, x4 - sbcs x8, x8, x3 - sbcs x9, x9, x3 - sbc x10, x10, x3 - stp x5, x6, [x0] - stp x7, x8, [x0, #16] - stp x9, x10, [x0, #32] + ldp x5, x6, [x1] + ldp x4, x3, [x2] + adds x5, x5, x4 + adcs x6, x6, x3 + ldp x7, x8, [x1, #16] + ldp x4, x3, [x2, #16] + adcs x7, x7, x4 + adcs x8, x8, x3 + ldp x9, x10, [x1, #32] + ldp x4, x3, [x2, #32] + adcs x9, x9, x4 + adcs x10, x10, x3 + adc x3, xzr, xzr + mov x4, #0xffffffff + cmp x5, x4 + mov x4, #0xffffffff00000000 + sbcs xzr, x6, x4 + mov x4, #0xfffffffffffffffe + sbcs xzr, x7, x4 + adcs xzr, x8, xzr + adcs xzr, x9, xzr + adcs xzr, x10, xzr + adcs x3, x3, xzr + csetm x3, ne + mov x4, #0xffffffff + and x4, x4, x3 + subs x5, x5, x4 + eor x4, x4, x3 + sbcs x6, x6, x4 + mov x4, #0xfffffffffffffffe + and x4, x4, x3 + sbcs x7, x7, x4 + sbcs x8, x8, x3 + sbcs x9, x9, x3 + sbc x10, x10, x3 + stp x5, x6, [x0] + stp x7, x8, [x0, #16] + stp x9, x10, [x0, #32] ret @@ -891,248 +891,248 @@ // P0 = 4 * P1 - P2 #define cmsub41_p384(P0,P1,P2) \ - ldp x1, x2, [P1]; \ - ldp x3, x4, 
[P1+16]; \ - ldp x5, x6, [P1+32]; \ - lsl x0, x1, #2; \ - ldp x7, x8, [P2]; \ - subs x0, x0, x7; \ - extr x1, x2, x1, #62; \ - sbcs x1, x1, x8; \ - ldp x7, x8, [P2+16]; \ - extr x2, x3, x2, #62; \ - sbcs x2, x2, x7; \ - extr x3, x4, x3, #62; \ - sbcs x3, x3, x8; \ - extr x4, x5, x4, #62; \ - ldp x7, x8, [P2+32]; \ - sbcs x4, x4, x7; \ - extr x5, x6, x5, #62; \ - sbcs x5, x5, x8; \ - lsr x6, x6, #62; \ - adc x6, x6, xzr; \ - lsl x7, x6, #32; \ - subs x8, x6, x7; \ - sbc x7, x7, xzr; \ - adds x0, x0, x8; \ - adcs x1, x1, x7; \ - adcs x2, x2, x6; \ - adcs x3, x3, xzr; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - csetm x8, cc; \ - mov x9, #0xffffffff; \ - and x9, x9, x8; \ - adds x0, x0, x9; \ - eor x9, x9, x8; \ - adcs x1, x1, x9; \ - mov x9, #0xfffffffffffffffe; \ - and x9, x9, x8; \ - adcs x2, x2, x9; \ - adcs x3, x3, x8; \ - adcs x4, x4, x8; \ - adc x5, x5, x8; \ - stp x0, x1, [P0]; \ - stp x2, x3, [P0+16]; \ + ldp x1, x2, [P1] __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P1+32] __LF \ + lsl x0, x1, #2 __LF \ + ldp x7, x8, [P2] __LF \ + subs x0, x0, x7 __LF \ + extr x1, x2, x1, #62 __LF \ + sbcs x1, x1, x8 __LF \ + ldp x7, x8, [P2+16] __LF \ + extr x2, x3, x2, #62 __LF \ + sbcs x2, x2, x7 __LF \ + extr x3, x4, x3, #62 __LF \ + sbcs x3, x3, x8 __LF \ + extr x4, x5, x4, #62 __LF \ + ldp x7, x8, [P2+32] __LF \ + sbcs x4, x4, x7 __LF \ + extr x5, x6, x5, #62 __LF \ + sbcs x5, x5, x8 __LF \ + lsr x6, x6, #62 __LF \ + adc x6, x6, xzr __LF \ + lsl x7, x6, #32 __LF \ + subs x8, x6, x7 __LF \ + sbc x7, x7, xzr __LF \ + adds x0, x0, x8 __LF \ + adcs x1, x1, x7 __LF \ + adcs x2, x2, x6 __LF \ + adcs x3, x3, xzr __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + csetm x8, cc __LF \ + mov x9, #0xffffffff __LF \ + and x9, x9, x8 __LF \ + adds x0, x0, x9 __LF \ + eor x9, x9, x8 __LF \ + adcs x1, x1, x9 __LF \ + mov x9, #0xfffffffffffffffe __LF \ + and x9, x9, x8 __LF \ + adcs x2, x2, x9 __LF \ + adcs x3, x3, x8 __LF \ + adcs x4, x4, x8 __LF \ + adc x5, x5, x8 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] __LF \ stp x4, x5, [P0+32] // P0 = C * P1 - D * P2 #define cmsub_p384(P0,C,P1,D,P2) \ - ldp x0, x1, [P2]; \ - mov x6, #0x00000000ffffffff; \ - subs x6, x6, x0; \ - mov x7, #0xffffffff00000000; \ - sbcs x7, x7, x1; \ - ldp x0, x1, [P2+16]; \ - mov x8, #0xfffffffffffffffe; \ - sbcs x8, x8, x0; \ - mov x13, #0xffffffffffffffff; \ - sbcs x9, x13, x1; \ - ldp x0, x1, [P2+32]; \ - sbcs x10, x13, x0; \ - sbc x11, x13, x1; \ - mov x12, D; \ - mul x0, x12, x6; \ - mul x1, x12, x7; \ - mul x2, x12, x8; \ - mul x3, x12, x9; \ - mul x4, x12, x10; \ - mul x5, x12, x11; \ - umulh x6, x12, x6; \ - umulh x7, x12, x7; \ - umulh x8, x12, x8; \ - umulh x9, x12, x9; \ - umulh x10, x12, x10; \ - umulh x12, x12, x11; \ - adds x1, x1, x6; \ - adcs x2, x2, x7; \ - adcs x3, x3, x8; \ - adcs x4, x4, x9; \ - adcs x5, x5, x10; \ - mov x6, #1; \ - adc x6, x12, x6; \ - ldp x8, x9, [P1]; \ - ldp x10, x11, [P1+16]; \ - ldp x12, x13, [P1+32]; \ - mov x14, C; \ - mul x15, x14, x8; \ - umulh x8, x14, x8; \ - adds x0, x0, x15; \ - mul x15, x14, x9; \ - umulh x9, x14, x9; \ - adcs x1, x1, x15; \ - mul x15, x14, x10; \ - umulh x10, x14, x10; \ - adcs x2, x2, x15; \ - mul x15, x14, x11; \ - umulh x11, x14, x11; \ - adcs x3, x3, x15; \ - mul x15, x14, x12; \ - umulh x12, x14, x12; \ - adcs x4, x4, x15; \ - mul x15, x14, x13; \ - umulh x13, x14, x13; \ - adcs x5, x5, x15; \ - adc x6, x6, xzr; \ - adds x1, x1, x8; \ - adcs x2, x2, x9; \ - adcs x3, x3, x10; \ - adcs x4, x4, x11; \ - adcs x5, x5, x12; \ - adcs x6, x6, 
x13; \ - lsl x7, x6, #32; \ - subs x8, x6, x7; \ - sbc x7, x7, xzr; \ - adds x0, x0, x8; \ - adcs x1, x1, x7; \ - adcs x2, x2, x6; \ - adcs x3, x3, xzr; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - csetm x6, cc; \ - mov x7, #0xffffffff; \ - and x7, x7, x6; \ - adds x0, x0, x7; \ - eor x7, x7, x6; \ - adcs x1, x1, x7; \ - mov x7, #0xfffffffffffffffe; \ - and x7, x7, x6; \ - adcs x2, x2, x7; \ - adcs x3, x3, x6; \ - adcs x4, x4, x6; \ - adc x5, x5, x6; \ - stp x0, x1, [P0]; \ - stp x2, x3, [P0+16]; \ + ldp x0, x1, [P2] __LF \ + mov x6, #0x00000000ffffffff __LF \ + subs x6, x6, x0 __LF \ + mov x7, #0xffffffff00000000 __LF \ + sbcs x7, x7, x1 __LF \ + ldp x0, x1, [P2+16] __LF \ + mov x8, #0xfffffffffffffffe __LF \ + sbcs x8, x8, x0 __LF \ + mov x13, #0xffffffffffffffff __LF \ + sbcs x9, x13, x1 __LF \ + ldp x0, x1, [P2+32] __LF \ + sbcs x10, x13, x0 __LF \ + sbc x11, x13, x1 __LF \ + mov x12, D __LF \ + mul x0, x12, x6 __LF \ + mul x1, x12, x7 __LF \ + mul x2, x12, x8 __LF \ + mul x3, x12, x9 __LF \ + mul x4, x12, x10 __LF \ + mul x5, x12, x11 __LF \ + umulh x6, x12, x6 __LF \ + umulh x7, x12, x7 __LF \ + umulh x8, x12, x8 __LF \ + umulh x9, x12, x9 __LF \ + umulh x10, x12, x10 __LF \ + umulh x12, x12, x11 __LF \ + adds x1, x1, x6 __LF \ + adcs x2, x2, x7 __LF \ + adcs x3, x3, x8 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + mov x6, #1 __LF \ + adc x6, x12, x6 __LF \ + ldp x8, x9, [P1] __LF \ + ldp x10, x11, [P1+16] __LF \ + ldp x12, x13, [P1+32] __LF \ + mov x14, C __LF \ + mul x15, x14, x8 __LF \ + umulh x8, x14, x8 __LF \ + adds x0, x0, x15 __LF \ + mul x15, x14, x9 __LF \ + umulh x9, x14, x9 __LF \ + adcs x1, x1, x15 __LF \ + mul x15, x14, x10 __LF \ + umulh x10, x14, x10 __LF \ + adcs x2, x2, x15 __LF \ + mul x15, x14, x11 __LF \ + umulh x11, x14, x11 __LF \ + adcs x3, x3, x15 __LF \ + mul x15, x14, x12 __LF \ + umulh x12, x14, x12 __LF \ + adcs x4, x4, x15 __LF \ + mul x15, x14, x13 __LF \ + umulh x13, x14, x13 __LF \ + adcs x5, x5, x15 __LF \ + adc x6, x6, xzr __LF \ + adds x1, x1, x8 __LF \ + adcs x2, x2, x9 __LF \ + adcs x3, x3, x10 __LF \ + adcs x4, x4, x11 __LF \ + adcs x5, x5, x12 __LF \ + adcs x6, x6, x13 __LF \ + lsl x7, x6, #32 __LF \ + subs x8, x6, x7 __LF \ + sbc x7, x7, xzr __LF \ + adds x0, x0, x8 __LF \ + adcs x1, x1, x7 __LF \ + adcs x2, x2, x6 __LF \ + adcs x3, x3, xzr __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + csetm x6, cc __LF \ + mov x7, #0xffffffff __LF \ + and x7, x7, x6 __LF \ + adds x0, x0, x7 __LF \ + eor x7, x7, x6 __LF \ + adcs x1, x1, x7 __LF \ + mov x7, #0xfffffffffffffffe __LF \ + and x7, x7, x6 __LF \ + adcs x2, x2, x7 __LF \ + adcs x3, x3, x6 __LF \ + adcs x4, x4, x6 __LF \ + adc x5, x5, x6 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] __LF \ stp x4, x5, [P0+32] // A weak version of add that only guarantees sum in 6 digits #define weakadd_p384(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - adds x5, x5, x4; \ - adcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - adcs x7, x7, x4; \ - adcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - adcs x9, x9, x4; \ - adcs x10, x10, x3; \ - csetm x3, cs; \ - mov x4, #0xffffffff; \ - and x4, x4, x3; \ - subs x5, x5, x4; \ - eor x4, x4, x3; \ - sbcs x6, x6, x4; \ - mov x4, #0xfffffffffffffffe; \ - and x4, x4, x3; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - sbcs x9, x9, x3; \ - sbc x10, x10, x3; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + adds x5, x5, x4 __LF \ + adcs x6, 
x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + adcs x7, x7, x4 __LF \ + adcs x8, x8, x3 __LF \ + ldp x9, x10, [P1+32] __LF \ + ldp x4, x3, [P2+32] __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x3 __LF \ + csetm x3, cs __LF \ + mov x4, #0xffffffff __LF \ + and x4, x4, x3 __LF \ + subs x5, x5, x4 __LF \ + eor x4, x4, x3 __LF \ + sbcs x6, x6, x4 __LF \ + mov x4, #0xfffffffffffffffe __LF \ + and x4, x4, x3 __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + sbcs x9, x9, x3 __LF \ + sbc x10, x10, x3 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ stp x9, x10, [P0+32] // P0 = 3 * P1 - 8 * P2 #define cmsub38_p384(P0,P1,P2) \ - ldp x0, x1, [P2]; \ - mov x6, #0x00000000ffffffff; \ - subs x6, x6, x0; \ - mov x7, #0xffffffff00000000; \ - sbcs x7, x7, x1; \ - ldp x0, x1, [P2+16]; \ - mov x8, #0xfffffffffffffffe; \ - sbcs x8, x8, x0; \ - mov x13, #0xffffffffffffffff; \ - sbcs x9, x13, x1; \ - ldp x0, x1, [P2+32]; \ - sbcs x10, x13, x0; \ - sbc x11, x13, x1; \ - lsl x0, x6, #3; \ - extr x1, x7, x6, #61; \ - extr x2, x8, x7, #61; \ - extr x3, x9, x8, #61; \ - extr x4, x10, x9, #61; \ - extr x5, x11, x10, #61; \ - lsr x6, x11, #61; \ - add x6, x6, #1; \ - ldp x8, x9, [P1]; \ - ldp x10, x11, [P1+16]; \ - ldp x12, x13, [P1+32]; \ - mov x14, 3; \ - mul x15, x14, x8; \ - umulh x8, x14, x8; \ - adds x0, x0, x15; \ - mul x15, x14, x9; \ - umulh x9, x14, x9; \ - adcs x1, x1, x15; \ - mul x15, x14, x10; \ - umulh x10, x14, x10; \ - adcs x2, x2, x15; \ - mul x15, x14, x11; \ - umulh x11, x14, x11; \ - adcs x3, x3, x15; \ - mul x15, x14, x12; \ - umulh x12, x14, x12; \ - adcs x4, x4, x15; \ - mul x15, x14, x13; \ - umulh x13, x14, x13; \ - adcs x5, x5, x15; \ - adc x6, x6, xzr; \ - adds x1, x1, x8; \ - adcs x2, x2, x9; \ - adcs x3, x3, x10; \ - adcs x4, x4, x11; \ - adcs x5, x5, x12; \ - adcs x6, x6, x13; \ - lsl x7, x6, #32; \ - subs x8, x6, x7; \ - sbc x7, x7, xzr; \ - adds x0, x0, x8; \ - adcs x1, x1, x7; \ - adcs x2, x2, x6; \ - adcs x3, x3, xzr; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - csetm x6, cc; \ - mov x7, #0xffffffff; \ - and x7, x7, x6; \ - adds x0, x0, x7; \ - eor x7, x7, x6; \ - adcs x1, x1, x7; \ - mov x7, #0xfffffffffffffffe; \ - and x7, x7, x6; \ - adcs x2, x2, x7; \ - adcs x3, x3, x6; \ - adcs x4, x4, x6; \ - adc x5, x5, x6; \ - stp x0, x1, [P0]; \ - stp x2, x3, [P0+16]; \ + ldp x0, x1, [P2] __LF \ + mov x6, #0x00000000ffffffff __LF \ + subs x6, x6, x0 __LF \ + mov x7, #0xffffffff00000000 __LF \ + sbcs x7, x7, x1 __LF \ + ldp x0, x1, [P2+16] __LF \ + mov x8, #0xfffffffffffffffe __LF \ + sbcs x8, x8, x0 __LF \ + mov x13, #0xffffffffffffffff __LF \ + sbcs x9, x13, x1 __LF \ + ldp x0, x1, [P2+32] __LF \ + sbcs x10, x13, x0 __LF \ + sbc x11, x13, x1 __LF \ + lsl x0, x6, #3 __LF \ + extr x1, x7, x6, #61 __LF \ + extr x2, x8, x7, #61 __LF \ + extr x3, x9, x8, #61 __LF \ + extr x4, x10, x9, #61 __LF \ + extr x5, x11, x10, #61 __LF \ + lsr x6, x11, #61 __LF \ + add x6, x6, #1 __LF \ + ldp x8, x9, [P1] __LF \ + ldp x10, x11, [P1+16] __LF \ + ldp x12, x13, [P1+32] __LF \ + mov x14, 3 __LF \ + mul x15, x14, x8 __LF \ + umulh x8, x14, x8 __LF \ + adds x0, x0, x15 __LF \ + mul x15, x14, x9 __LF \ + umulh x9, x14, x9 __LF \ + adcs x1, x1, x15 __LF \ + mul x15, x14, x10 __LF \ + umulh x10, x14, x10 __LF \ + adcs x2, x2, x15 __LF \ + mul x15, x14, x11 __LF \ + umulh x11, x14, x11 __LF \ + adcs x3, x3, x15 __LF \ + mul x15, x14, x12 __LF \ + umulh x12, x14, x12 __LF \ + adcs x4, x4, x15 __LF \ + mul x15, x14, x13 __LF \ + umulh x13, x14, x13 __LF \ + adcs 
x5, x5, x15 __LF \ + adc x6, x6, xzr __LF \ + adds x1, x1, x8 __LF \ + adcs x2, x2, x9 __LF \ + adcs x3, x3, x10 __LF \ + adcs x4, x4, x11 __LF \ + adcs x5, x5, x12 __LF \ + adcs x6, x6, x13 __LF \ + lsl x7, x6, #32 __LF \ + subs x8, x6, x7 __LF \ + sbc x7, x7, xzr __LF \ + adds x0, x0, x8 __LF \ + adcs x1, x1, x7 __LF \ + adcs x2, x2, x6 __LF \ + adcs x3, x3, xzr __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + csetm x6, cc __LF \ + mov x7, #0xffffffff __LF \ + and x7, x7, x6 __LF \ + adds x0, x0, x7 __LF \ + eor x7, x7, x6 __LF \ + adcs x1, x1, x7 __LF \ + mov x7, #0xfffffffffffffffe __LF \ + and x7, x7, x6 __LF \ + adcs x2, x2, x7 __LF \ + adcs x3, x3, x6 __LF \ + adcs x4, x4, x6 __LF \ + adc x5, x5, x6 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] __LF \ stp x4, x5, [P0+32] S2N_BN_SYMBOL(p384_montjdouble): diff --git a/third_party/s2n-bignum/arm/p521/Makefile b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/Makefile similarity index 93% rename from third_party/s2n-bignum/arm/p521/Makefile rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/Makefile index 3936b48307c..620ff871d4f 100644 --- a/third_party/s2n-bignum/arm/p521/Makefile +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/Makefile @@ -33,18 +33,14 @@ OBJ = bignum_add_p521.o \ bignum_mod_p521_9.o \ bignum_montmul_p521.o \ bignum_montmul_p521_alt.o \ - bignum_montmul_p521_neon.o \ bignum_montsqr_p521.o \ bignum_montsqr_p521_alt.o \ - bignum_montsqr_p521_neon.o \ bignum_mul_p521.o \ bignum_mul_p521_alt.o \ - bignum_mul_p521_neon.o \ bignum_neg_p521.o \ bignum_optneg_p521.o \ bignum_sqr_p521.o \ bignum_sqr_p521_alt.o \ - bignum_sqr_p521_neon.o \ bignum_sub_p521.o \ bignum_tolebytes_p521.o \ bignum_tomont_p521.o \ diff --git a/third_party/s2n-bignum/arm/p521/bignum_add_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_add_p521.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_add_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_add_p521.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_cmul_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_cmul_p521.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_cmul_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_cmul_p521.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_deamont_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_deamont_p521.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_deamont_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_deamont_p521.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_demont_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_demont_p521.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_demont_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_demont_p521.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_double_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_double_p521.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_double_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_double_p521.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_fromlebytes_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_fromlebytes_p521.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_fromlebytes_p521.S rename to 
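Stepping back to the p384_montjdouble.S hunks above: cmsub_p384 and cmsub38_p384 evaluate C*P1 - D*P2 as the non-negative C*P1 + D*(p_384 - P2), where the six constants loaded at the top of each macro (0x00000000ffffffff, 0xffffffff00000000, 0xfffffffffffffffe and three all-ones words) are exactly the limbs of p_384; with small C and D the result then needs only the short final reduction that follows. A sketch of that identity, assuming P2 < p_384 so the word-wise complement never borrows:

import random

p_384 = 2**384 - 2**128 - 2**96 + 2**32 - 1
limbs = [0x00000000ffffffff, 0xffffffff00000000, 0xfffffffffffffffe,
         0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff]
assert sum(l << (64 * i) for i, l in enumerate(limbs)) == p_384
for _ in range(100):                         # the 3*P1 - 8*P2 case of cmsub38_p384
    P1, P2 = random.randrange(p_384), random.randrange(p_384)
    assert (3 * P1 + 8 * (p_384 - P2)) % p_384 == (3 * P1 - 8 * P2) % p_384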
third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_fromlebytes_p521.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_half_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_half_p521.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_half_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_half_p521.S diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_inv_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_inv_p521.S new file mode 100644 index 00000000000..2962fc2106f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_inv_p521.S @@ -0,0 +1,1696 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Modular inverse modulo p_521 = 2^521 - 1 +// Input x[9]; output z[9] +// +// extern void bignum_inv_p521(uint64_t z[static 9],uint64_t x[static 9]); +// +// Assuming the 9-digit input x is coprime to p_521, i.e. is not divisible +// by it, returns z < p_521 such that x * z == 1 (mod p_521). Note that +// x does not need to be reduced modulo p_521, but the output always is. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_inv_p521) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_inv_p521) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Used for the return pointer + +#define res x20 + +// Loop counter and d = 2 * delta value for divstep + +#define i x21 +#define d x22 + +// Registers used for matrix element magnitudes and signs + +#define m00 x10 +#define m01 x11 +#define m10 x12 +#define m11 x13 +#define s00 x14 +#define s01 x15 +#define s10 x16 +#define s11 x17 + +// Initial carries for combinations + +#define car0 x9 +#define car1 x19 + +// Input and output, plain registers treated according to pattern + +#define reg0 x0, #0 +#define reg1 x1, #0 +#define reg2 x2, #0 +#define reg3 x3, #0 +#define reg4 x4, #0 + +#define x x1, #0 +#define z x0, #0 + +// Pointer-offset pairs for temporaries on stack + +#define f sp, #0 +#define g sp, #(9*N) +#define u sp, #(18*N) +#define v sp, #(27*N) + +// Total size to reserve on the stack + +#define NSPACE #(36*N) + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. 
+// But different in register usage and returning the final matrix in +// registers as follows +// +// [ m00 m01] +// [ m10 m11] + +#define divstep59() \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, 
x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x8, x4, #0x100, lsl #12 __LF \ + sbfx x8, x8, #21, #21 __LF \ + mov x11, #0x100000 __LF \ + add x11, x11, x11, lsl #21 __LF \ + add x9, x4, x11 __LF \ + asr x9, x9, #42 __LF \ + add x10, x5, #0x100, lsl #12 __LF \ + sbfx x10, x10, #21, #21 __LF \ + add x11, x5, x11 __LF \ + asr x11, x11, #42 __LF \ + mul x6, x8, x2 __LF \ + mul x7, x9, x3 __LF \ + mul x2, x10, x2 __LF \ + mul x3, x11, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr 
x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #21, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #42 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, 
x14, #21, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #42 __LF \ + mul x6, x12, x2 __LF \ + mul x7, x13, x3 __LF \ + mul x2, x14, x2 __LF \ + mul x3, x15, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + mul x2, x12, x8 __LF \ + mul x3, x12, x9 __LF \ + mul x6, x14, x8 __LF \ + mul x7, x14, x9 __LF \ + madd x8, x13, x10, x2 __LF \ + madd x9, x13, x11, x3 __LF \ + madd x16, x15, x10, x6 __LF \ + madd x17, x15, x11, x7 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, 
#0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #22, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #43 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, x14, #22, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #43 __LF \ + mneg x2, x12, x8 __LF \ + mneg x3, x12, x9 __LF \ + mneg x4, x14, x8 __LF \ + mneg x5, x14, x9 __LF \ + msub m00, x13, x16, x2 __LF \ + msub m01, x13, x17, x3 __LF \ + msub m10, x15, x16, x4 __LF \ + msub m11, x15, x17, x5 + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(bignum_inv_p521): + +// Save registers and make room for temporaries + + stp x19, x20, [sp, -16]! + stp x21, x22, [sp, -16]! + sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Copy the prime p_521 = 2^521 - 1 into the f variable + + mov x10, #0xFFFFFFFFFFFFFFFF + stp x10, x10, [f] + stp x10, x10, [f+16] + stp x10, x10, [f+32] + stp x10, x10, [f+48] + mov x11, #0x1FF + str x11, [f+64] + +// Copy the input into the g variable, but reduce it strictly mod p_521 +// so that g <= f as assumed in the bound proof. This code fragment is +// very similar to bignum_mod_p521_9 complete with carry condensation. 
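As a side note, the contract of this strict-reduction fragment can be sketched in plain C. This is a minimal sketch assuming only the nine-limb little-endian layout (bits 512..575 in the top word); the helper name ref_mod_p521_9 is hypothetical, and it uses a plain trial addition instead of the carry-condensation trick in the assembly below, relying on 2^521 == 1 (mod p_521).

#include <stdint.h>

// Reference sketch (not the s2n-bignum routine): reduce a 9-limb value
// x < 2^576 strictly mod p_521 = 2^521 - 1.
void ref_mod_p521_9(uint64_t z[9], const uint64_t x[9]) {
    uint64_t h = x[8] >> 9;               // bits above position 520, so x = lo + h*2^521
    unsigned __int128 c = (unsigned __int128)h + 1;   // trial add of h + 1 (GCC/Clang extension)
    uint64_t t[9];
    for (int i = 0; i < 9; i++) {
        c += (i == 8) ? (x[8] & 0x1FF) : x[i];
        t[i] = (uint64_t)c;
        c >>= 64;
    }
    // lo + h + 1 reaches 2^521 exactly when lo + h >= p_521
    if (t[8] >> 9) {
        for (int i = 0; i < 9; i++) z[i] = t[i];
        z[8] &= 0x1FF;                    // lo + h - p_521 = (lo + h + 1) mod 2^521
    } else {
        c = h;                            // lo + h < p_521: redo the addition without the +1
        for (int i = 0; i < 9; i++) {
            c += (i == 8) ? (x[8] & 0x1FF) : x[i];
            z[i] = (uint64_t)c;
            c >>= 64;
        }
    }
}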
+ + ldr x8, [x1, #64] + lsr x9, x8, #9 + + subs xzr, xzr, xzr + ldp x10, x11, [x1] + adcs xzr, x10, x9 + adcs xzr, x11, xzr + ldp x12, x13, [x1, #16] + and x7, x12, x13 + adcs xzr, x7, xzr + ldp x14, x15, [x1, #32] + and x7, x14, x15 + adcs xzr, x7, xzr + ldp x16, x17, [x1, #48] + and x7, x16, x17 + adcs xzr, x7, xzr + orr x7, x8, #~0x1FF + adcs x7, x7, xzr + + adcs x10, x10, x9 + adcs x11, x11, xzr + adcs x12, x12, xzr + adcs x13, x13, xzr + adcs x14, x14, xzr + adcs x15, x15, xzr + adcs x16, x16, xzr + adcs x17, x17, xzr + adc x8, x8, xzr + and x8, x8, #0x1FF + + stp x10, x11, [g] + stp x12, x13, [g+16] + stp x14, x15, [g+32] + stp x16, x17, [g+48] + str x8, [g+64] + +// Also maintain weakly reduced < 2*p_521 vector [u,v] such that +// [f,g] == x * 2^{1239-59*i} * [u,v] (mod p_521) +// starting with [p_521,x] == x * 2^{1239-59*0} * [0,2^-1239] (mod p_521) +// Note that because (2^{a+521} == 2^a) (mod p_521) we simply have +// (2^-1239 == 2^324) (mod p_521) so the constant initializer is simple. +// +// Based on the standard divstep bound, for inputs <= 2^b we need at least +// n >= (9437 * b + 1) / 4096. Since b is 521, that means 1201 iterations. +// Since we package divstep in multiples of 59 bits, we do 21 blocks of 59 +// making *1239* total. (With a bit more effort we could avoid the full 59 +// divsteps and use a shorter tail computation, but we keep it simple.) +// Hence, after the 21st iteration we have [f,g] == x * [u,v] and since +// |f| = 1 we get the modular inverse from u by flipping its sign with f. + + stp xzr, xzr, [u] + stp xzr, xzr, [u+16] + stp xzr, xzr, [u+32] + stp xzr, xzr, [u+48] + str xzr, [u+64] + + mov x10, #16 + stp xzr, xzr, [v] + stp xzr, xzr, [v+16] + stp xzr, x10, [v+32] + stp xzr, xzr, [v+48] + str xzr, [v+64] + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special 21st iteration after a uniform +// first 20. + + mov i, #21 + mov d, #1 + b bignum_inv_p521_midloop + +bignum_inv_p521_loop: + +// Separate the matrix elements into sign-magnitude pairs + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in stable registers for the [u,v] part and do [f,g] first. + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + + and x0, m10, s10 + and x1, m11, s11 + add car1, x0, x1 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. 
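A quick check of the iteration count quoted in the comment above, under the stated divstep bound n >= (9437*b + 1)/4096 (taken from the source comments, not re-derived here):

#include <assert.h>

int main(void) {
    const long b = 521;
    const long need = (9437 * b + 1 + 4095) / 4096;  // ceiling of (9437*521 + 1)/4096
    assert(need == 1201);                            // minimum number of divsteps
    assert(21 * 59 == 1239 && 1239 >= need);         // 21 blocks of 59 cover it
    return 0;
}

So the 2^{1239-59*i} factor in the [u,v] invariant is just 21 blocks of 59 divsteps, the total actually performed.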
+// +// Digit 0 of [f,g] + + ldr x7, [f] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [g] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x3, x3, x1 + +// Digit 1 of [f,g] + + ldr x7, [f+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [g+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [f] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [g] + +// Digit 2 of [f,g] + + ldr x7, [f+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [g+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [f+N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [g+N] + +// Digit 3 of [f,g] + + ldr x7, [f+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [g+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [f+2*N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x6, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [g+2*N] + +// Digit 4 of [f,g] + + ldr x7, [f+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [g+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [f+3*N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x6, x6, x0 + adc x5, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [g+3*N] + +// Digit 5 of [f,g] + + ldr x7, [f+5*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, xzr, x1 + ldr x8, [g+5*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [f+4*N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, x5, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [g+4*N] + +// Digit 6 of [f,g] + + ldr x7, [f+6*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [g+6*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [f+5*N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [g+5*N] + +// Digit 7 of [f,g] + + ldr x7, [f+7*N] + eor x1, x7, s00 + mul x0, x1, 
m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [g+7*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [f+6*N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [g+6*N] + +// Digits 8 and 9 of [f,g] + + ldr x7, [f+8*N] + eor x1, x7, s00 + asr x3, x1, #63 + and x3, x3, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [g+8*N] + eor x1, x8, s01 + asr x0, x1, #63 + and x0, x0, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [f+7*N] + extr x5, x3, x5, #59 + str x5, [f+8*N] + + eor x1, x7, s10 + asr x5, x1, #63 + and x5, x5, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + asr x0, x1, #63 + and x0, x0, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [g+7*N] + extr x2, x5, x2, #59 + str x2, [g+8*N] + +// Now the computation of the updated u and v values and their +// modular reductions. A very similar accumulation except that +// the top words of u and v are unsigned and we don't shift. +// +// Digit 0 of [u,v] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v] + adc x3, x3, x1 + +// Digit 1 of [u,v] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + str x3, [v+N] + adc x4, x4, x1 + +// Digit 2 of [u,v] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + str x4, [v+2*N] + adc x2, x2, x1 + +// Digit 3 of [u,v] + + ldr x7, [u+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + str x5, [u+3*N] + adc x3, x3, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x6, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + str x2, [v+3*N] + adc x6, x6, x1 + +// Digit 4 of [u,v] + + ldr x7, [u+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [v+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + str x3, [u+4*N] + adc x4, x4, x1 + + eor x1, x7, s10 
+ mul x0, x1, m10 + umulh x1, x1, m10 + adds x6, x6, x0 + adc x5, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x6, x6, x0 + str x6, [v+4*N] + adc x5, x5, x1 + +// Digit 5 of [u,v] + + ldr x7, [u+5*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, xzr, x1 + ldr x8, [v+5*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u+5*N] + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, x5, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v+5*N] + adc x3, x3, x1 + +// Digit 6 of [u,v] + + ldr x7, [u+6*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+6*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+6*N] + adc x6, x6, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + str x3, [v+6*N] + adc x4, x4, x1 + +// Digit 7 of [u,v] + + ldr x7, [u+7*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+7*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+7*N] + adc x5, x5, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + str x4, [v+7*N] + adc x2, x2, x1 + +// Digits 8 and 9 of u (top is unsigned) + + ldr x7, [u+8*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+8*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + +// Modular reduction of u, reloading as needed from u[0],...,u[7],x5,x3 + + extr x6, x3, x5, #9 + ldp x0, x1, [u] + add x6, x6, x3, asr #63 + sub x5, x5, x6, lsl #9 + adds x0, x0, x6 + asr x6, x6, #63 + adcs x1, x1, x6 + stp x0, x1, [u] + ldp x0, x1, [u+16] + adcs x0, x0, x6 + adcs x1, x1, x6 + stp x0, x1, [u+16] + ldp x0, x1, [u+32] + adcs x0, x0, x6 + adcs x1, x1, x6 + stp x0, x1, [u+32] + ldp x0, x1, [u+48] + adcs x0, x0, x6 + adcs x1, x1, x6 + stp x0, x1, [u+48] + adc x5, x5, x6 + str x5, [u+64] + +// Digits 8 and 9 of v (top is unsigned) + + eor x1, x7, s10 + and x5, s10, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + and x0, s11, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + +// Modular reduction of v, reloading as needed from v[0],...,v[7],x2,x5 + + extr x6, x5, x2, #9 + ldp x0, x1, [v] + add x6, x6, x5, asr #63 + sub x2, x2, x6, lsl #9 + adds x0, x0, x6 + asr x6, x6, #63 + adcs x1, x1, x6 + stp x0, x1, [v] + ldp x0, x1, [v+16] + adcs x0, x0, x6 + adcs x1, x1, x6 + stp x0, x1, [v+16] + ldp x0, x1, [v+32] + adcs x0, x0, x6 + adcs x1, x1, x6 + stp x0, x1, [v+32] + ldp x0, x1, [v+48] + adcs x0, x0, x6 + adcs x1, x1, x6 + stp x0, x1, [v+48] + adc x2, x2, x6 + str x2, [v+64] + +bignum_inv_p521_midloop: + + mov x1, d + ldr x2, [f] + ldr x3, [g] + divstep59() + mov d, x1 + +// Next iteration + + subs i, i, #1 + bne bignum_inv_p521_loop + +// The 21st and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. 
So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + ldr x0, [f] + ldr x1, [g] + mul x0, x0, m00 + madd x1, x1, m01, x0 + asr x0, x1, #63 + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * [u,v] (mod p_521) +// we want to flip the sign of u according to that of f. + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + eor s00, s00, x0 + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + eor s01, s01, x0 + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + eor s10, s10, x0 + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + eor s11, s11, x0 + +// Adjust the initial value to allow for complement instead of negation + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + +// Digit 0 of [u] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + +// Digit 1 of [u] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + +// Digit 2 of [u] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + +// Digit 3 of [u] + + ldr x7, [u+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + str x5, [u+3*N] + adc x3, x3, x1 + +// Digit 4 of [u] + + ldr x7, [u+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [v+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + str x3, [u+4*N] + adc x4, x4, x1 + +// Digit 5 of [u] + + ldr x7, [u+5*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, xzr, x1 + ldr x8, [v+5*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u+5*N] + adc x2, x2, x1 + +// Digit 6 of [u] + + ldr x7, [u+6*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+6*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+6*N] + adc x6, x6, x1 + +// Digit 7 of [u] + + ldr x7, [u+7*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+7*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+7*N] + adc x5, x5, x1 + +// Digits 8 and 9 of u (top is unsigned) + + ldr x7, [u+8*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+8*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc 
x3, x3, x1 + +// Modular reduction of u, reloading as needed from u[0],...,u[7],x5,x3 + + extr x6, x3, x5, #9 + ldp x10, x11, [u] + add x6, x6, x3, asr #63 + sub x5, x5, x6, lsl #9 + adds x10, x10, x6 + asr x6, x6, #63 + adcs x11, x11, x6 + ldp x12, x13, [u+16] + adcs x12, x12, x6 + adcs x13, x13, x6 + ldp x14, x15, [u+32] + adcs x14, x14, x6 + adcs x15, x15, x6 + ldp x16, x17, [u+48] + adcs x16, x16, x6 + adcs x17, x17, x6 + adc x19, x5, x6 + +// Further strict reduction ready for the output, which just means +// a conditional subtraction of p_521 + + subs x0, x10, #-1 + adcs x1, x11, xzr + adcs x2, x12, xzr + adcs x3, x13, xzr + adcs x4, x14, xzr + adcs x5, x15, xzr + adcs x6, x16, xzr + adcs x7, x17, xzr + mov x8, #0x1FF + sbcs x8, x19, x8 + + csel x0, x0, x10, cs + csel x1, x1, x11, cs + csel x2, x2, x12, cs + csel x3, x3, x13, cs + csel x4, x4, x14, cs + csel x5, x5, x15, cs + csel x6, x6, x16, cs + csel x7, x7, x17, cs + csel x8, x8, x19, cs + +// Store it back to the final output + + stp x0, x1, [res] + stp x2, x3, [res, #16] + stp x4, x5, [res, #32] + stp x6, x7, [res, #48] + str x8, [res, #64] + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p521/bignum_mod_n521_9.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_mod_n521_9.S similarity index 94% rename from third_party/s2n-bignum/arm/p521/bignum_mod_n521_9.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_mod_n521_9.S index d680e5f1db1..6dec50317b5 100644 --- a/third_party/s2n-bignum/arm/p521/bignum_mod_n521_9.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_mod_n521_9.S @@ -47,9 +47,9 @@ #define t d7 #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 S2N_BN_SYMBOL(bignum_mod_n521_9): diff --git a/third_party/s2n-bignum/arm/p521/bignum_mod_p521_9.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_mod_p521_9.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_mod_p521_9.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_mod_p521_9.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_montmul_p521_neon.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_montmul_p521.S similarity index 99% rename from third_party/s2n-bignum/arm/p521/bignum_montmul_p521_neon.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_montmul_p521.S index 9586339f955..a88442df4a6 100644 --- a/third_party/s2n-bignum/arm/p521/bignum_montmul_p521_neon.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_montmul_p521.S @@ -5,7 +5,7 @@ // Montgomery multiply, z := (x * y / 2^576) mod p_521 // Inputs x[9], y[9]; output z[9] // -// extern void bignum_montmul_p521_neon +// extern void bignum_montmul_p521 // (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]); // // Does z := (x * y / 2^576) mod p_521, assuming x < p_521, y < p_521. This @@ -17,7 +17,8 @@ // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" -// bignum_montmul_p521_neon is functionally equivalent to bignum_montmul_p521. +// bignum_montmul_p521 is functionally equivalent to +// unopt/bignum_montmul_p521_base. 
// It is written in a way that // 1. A subset of scalar multiplications in bignum_montmul_p384 are carefully // chosen and vectorized @@ -717,12 +718,12 @@ // # from this file since the sequence is non-deterministically chosen. // # Please add 'ret' at the end of the output assembly. - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p521_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p521_neon) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p521) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p521) .text .balign 4 -S2N_BN_SYMBOL(bignum_montmul_p521_neon): +S2N_BN_SYMBOL(bignum_montmul_p521): // Save registers and make space for the temporary buffer diff --git a/third_party/s2n-bignum/arm/p521/bignum_montmul_p521_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_montmul_p521_alt.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_montmul_p521_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_montmul_p521_alt.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_montsqr_p521_neon.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_montsqr_p521.S similarity index 99% rename from third_party/s2n-bignum/arm/p521/bignum_montsqr_p521_neon.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_montsqr_p521.S index 57cf9116156..5d1dccfd539 100644 --- a/third_party/s2n-bignum/arm/p521/bignum_montsqr_p521_neon.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_montsqr_p521.S @@ -5,7 +5,7 @@ // Montgomery square, z := (x^2 / 2^576) mod p_521 // Input x[9]; output z[9] // -// extern void bignum_montsqr_p521_neon +// extern void bignum_montsqr_p521 // (uint64_t z[static 9], uint64_t x[static 9]); // // Does z := (x^2 / 2^576) mod p_521, assuming x < p_521. This means the @@ -17,7 +17,8 @@ // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" -// bignum_montsqr_p521_neon is functionally equivalent to bignum_montsqr_p521. +// bignum_montsqr_p521 is functionally equivalent to +// unopt/bignum_montsqr_p521. // It is written in a way that // 1. A subset of scalar multiplications in bignum_montmul_p384 are carefully // chosen and vectorized @@ -570,12 +571,12 @@ // # from this file since the sequence is non-deterministically chosen. // # Please add 'ret' at the end of the output assembly. 
- S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p521_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p521_neon) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p521) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p521) .text .balign 4 -S2N_BN_SYMBOL(bignum_montsqr_p521_neon): +S2N_BN_SYMBOL(bignum_montsqr_p521): // Save registers diff --git a/third_party/s2n-bignum/arm/p521/bignum_montsqr_p521_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_montsqr_p521_alt.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_montsqr_p521_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_montsqr_p521_alt.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_mul_p521_neon.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_mul_p521.S similarity index 99% rename from third_party/s2n-bignum/arm/p521/bignum_mul_p521_neon.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_mul_p521.S index c9d34151d56..5eba505ba7e 100644 --- a/third_party/s2n-bignum/arm/p521/bignum_mul_p521_neon.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_mul_p521.S @@ -5,14 +5,14 @@ // Multiply modulo p_521, z := (x * y) mod p_521, assuming x and y reduced // Inputs x[9], y[9]; output z[9] // -// extern void bignum_mul_p521_neon +// extern void bignum_mul_p521 // (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]); // // Standard ARM ABI: X0 = z, X1 = x, X2 = y // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" -// bignum_mul_p521_neon is functionally equivalent to bignum_mul_p521. +// bignum_mul_p521 is functionally equivalent to unopt/bignum_mul_p521_base. // It is written in a way that // 1. A subset of scalar multiplications in bignum_montmul_p384 are carefully // chosen and vectorized @@ -708,12 +708,12 @@ // # from this file since the sequence is non-deterministically chosen. // # Please add 'ret' at the end of the output assembly. 
- S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p521_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p521_neon) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p521) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p521) .text .balign 4 -S2N_BN_SYMBOL(bignum_mul_p521_neon): +S2N_BN_SYMBOL(bignum_mul_p521): // Save registers and make space for the temporary buffer diff --git a/third_party/s2n-bignum/arm/p521/bignum_mul_p521_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_mul_p521_alt.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_mul_p521_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_mul_p521_alt.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_neg_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_neg_p521.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_neg_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_neg_p521.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_optneg_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_optneg_p521.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_optneg_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_optneg_p521.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_sqr_p521_neon.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_sqr_p521.S similarity index 99% rename from third_party/s2n-bignum/arm/p521/bignum_sqr_p521_neon.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_sqr_p521.S index 13cd1c25419..6c9cac4d7c0 100644 --- a/third_party/s2n-bignum/arm/p521/bignum_sqr_p521_neon.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_sqr_p521.S @@ -5,14 +5,14 @@ // Square modulo p_521, z := (x^2) mod p_521, assuming x reduced // Input x[9]; output z[9] // -// extern void bignum_sqr_p521_neon (uint64_t z[static 9], +// extern void bignum_sqr_p521 (uint64_t z[static 9], // uint64_t x[static 9]); // // Standard ARM ABI: X0 = z, X1 = x // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" -// bignum_montsqr_p521_neon is functionally equivalent to bignum_montsqr_p521. +// bignum_sqr_p521 is functionally equivalent to unopt/bignum_sqr_p521_base. // It is written in a way that // 1. A subset of scalar multiplications in bignum_montmul_p384 are carefully // chosen and vectorized @@ -567,12 +567,12 @@ // # from this file since the sequence is non-deterministically chosen. // # Please add 'ret' at the end of the output assembly. 
- S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p521_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p521_neon) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p521) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p521) .text .balign 4 -S2N_BN_SYMBOL(bignum_sqr_p521_neon): +S2N_BN_SYMBOL(bignum_sqr_p521): // Save registers diff --git a/third_party/s2n-bignum/arm/p521/bignum_sqr_p521_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_sqr_p521_alt.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_sqr_p521_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_sqr_p521_alt.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_sub_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_sub_p521.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_sub_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_sub_p521.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_tolebytes_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_tolebytes_p521.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_tolebytes_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_tolebytes_p521.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_tomont_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_tomont_p521.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_tomont_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_tomont_p521.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_triple_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_triple_p521.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_triple_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_triple_p521.S diff --git a/third_party/s2n-bignum/arm/p521/p521_jadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jadd.S similarity index 98% rename from third_party/s2n-bignum/arm/p521/p521_jadd.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jadd.S index 6dbcad2b7bd..de36c6566a5 100644 --- a/third_party/s2n-bignum/arm/p521/p521_jadd.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jadd.S @@ -84,21 +84,21 @@ // and bignum_sub_p521 #define mul_p521(P0,P1,P2) \ - add x0, P0; \ - add x1, P1; \ - add x2, P2; \ - bl local_mul_p521 + add x0, P0 __LF \ + add x1, P1 __LF \ + add x2, P2 __LF \ + bl p521_jadd_local_mul_p521 #define sqr_p521(P0,P1) \ - add x0, P0; \ - add x1, P1; \ - bl local_sqr_p521 + add x0, P0 __LF \ + add x1, P1 __LF \ + bl p521_jadd_local_sqr_p521 #define sub_p521(P0,P1,P2) \ - add x0, P0; \ - add x1, P1; \ - add x2, P2; \ - bl local_sub_p521 + add x0, P0 __LF \ + add x1, P1 __LF \ + add x2, P2 __LF \ + bl p521_jadd_local_sub_p521 S2N_BN_SYMBOL(p521_jadd): @@ -348,9 +348,9 @@ S2N_BN_SYMBOL(p521_jadd): ret // Local versions of the three field operations, identical to -// bignum_mul_p521_neon, bignum_sqr_p521_neon and bignum_sub_p521. +// bignum_mul_p521, bignum_sqr_p521 and bignum_sub_p521. -local_mul_p521: +p521_jadd_local_mul_p521: stp x19, x20, [sp, #-16]! stp x21, x22, [sp, #-16]! stp x23, x24, [sp, #-16]! @@ -1027,7 +1027,7 @@ local_mul_p521: ldp x19, x20, [sp], #16 ret -local_sqr_p521: +p521_jadd_local_sqr_p521: stp x19, x20, [sp, #-16]! stp x21, x22, [sp, #-16]! stp x23, x24, [sp, #-16]! 
@@ -1563,7 +1563,7 @@ local_sqr_p521: ldp x19, x20, [sp], #16 ret -local_sub_p521: +p521_jadd_local_sub_p521: ldp x5, x6, [x1] ldp x4, x3, [x2] subs x5, x5, x4 diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jadd_alt.S new file mode 100644 index 00000000000..da6166b8138 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jadd_alt.S @@ -0,0 +1,979 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on NIST curve P-521 in Jacobian coordinates +// +// extern void p521_jadd_alt +// (uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 27]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// It is assumed that all coordinates of the input points p1 and p2 are +// fully reduced mod p_521, that both z coordinates are nonzero and +// that neither p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents +// the same affine point as". +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jadd_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 72 + +// Stable homes for input arguments during main code sequence + +#define input_z x26 +#define input_x x27 +#define input_y x28 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE +#define z_2 input_y, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define x1a sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define z2sq sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define y1a sp, #(NUMSIZE*6) + +// NUMSIZE*7 is not 16-aligned so we round it up + +#define NSPACE (NUMSIZE*7+8) + +// Corresponds exactly to bignum_mul_p521_alt + +#define mul_p521(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + mul x15, x3, x5 __LF \ + umulh x16, x3, x5 __LF \ + mul x14, x3, x6 __LF \ + umulh x17, x3, x6 __LF \ + adds x16, x16, x14 __LF \ + ldp x7, x8, [P2+16] __LF \ + mul x14, x3, x7 __LF \ + umulh x19, x3, x7 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x3, x8 __LF \ + umulh x20, x3, x8 __LF \ + adcs x19, x19, x14 __LF \ + ldp x9, x10, [P2+32] __LF \ + mul x14, x3, x9 __LF \ + umulh x21, x3, x9 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x3, x10 __LF \ + umulh x22, x3, x10 __LF \ + adcs x21, x21, x14 __LF \ + ldp x11, x12, [P2+48] __LF \ + mul x14, x3, x11 __LF \ + umulh x23, x3, x11 __LF \ + adcs x22, x22, x14 __LF \ + ldr x13, 
[P2+64] __LF \ + mul x14, x3, x12 __LF \ + umulh x24, x3, x12 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x13 __LF \ + umulh x1, x3, x13 __LF \ + adcs x24, x24, x14 __LF \ + adc x1, x1, xzr __LF \ + mul x14, x4, x5 __LF \ + adds x16, x16, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x1, x1, x14 __LF \ + cset x0, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x17, x17, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x21, x21, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x0, x0, x14 __LF \ + stp x15, x16, [P0] __LF \ + ldp x3, x4, [P1+16] __LF \ + mul x14, x3, x5 __LF \ + adds x17, x17, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x13 __LF \ + adcs x0, x0, x14 __LF \ + cset x15, hs __LF \ + umulh x14, x3, x5 __LF \ + adds x19, x19, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x21, x21, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x3, x12 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x13 __LF \ + adc x15, x15, x14 __LF \ + mul x14, x4, x5 __LF \ + adds x19, x19, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x15, x15, x14 __LF \ + cset x16, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x20, x20, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x21, x21, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x16, x16, x14 __LF \ + stp x17, x19, [P0+16] __LF \ + ldp x3, x4, [P1+32] __LF \ + mul x14, x3, x5 __LF \ + adds x20, x20, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x24, x24, 
x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x3, x13 __LF \ + adcs x16, x16, x14 __LF \ + cset x17, hs __LF \ + umulh x14, x3, x5 __LF \ + adds x21, x21, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x3, x12 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x3, x13 __LF \ + adc x17, x17, x14 __LF \ + mul x14, x4, x5 __LF \ + adds x21, x21, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x17, x17, x14 __LF \ + cset x19, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x22, x22, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x19, x19, x14 __LF \ + stp x20, x21, [P0+32] __LF \ + ldp x3, x4, [P1+48] __LF \ + mul x14, x3, x5 __LF \ + adds x22, x22, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x3, x13 __LF \ + adcs x19, x19, x14 __LF \ + cset x20, hs __LF \ + umulh x14, x3, x5 __LF \ + adds x23, x23, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x3, x12 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x3, x13 __LF \ + adc x20, x20, x14 __LF \ + mul x14, x4, x5 __LF \ + adds x23, x23, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x20, x20, x14 __LF \ + cset x21, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x24, x24, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x16, x16, x14 __LF \ 
+ umulh x14, x4, x10 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x21, x21, x14 __LF \ + stp x22, x23, [P0+48] __LF \ + ldr x3, [P1+64] __LF \ + mul x14, x3, x5 __LF \ + adds x24, x24, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x3, x13 __LF \ + adc x21, x21, x14 __LF \ + umulh x14, x3, x5 __LF \ + adds x1, x1, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x3, x12 __LF \ + adc x21, x21, x14 __LF \ + cmp xzr, xzr __LF \ + ldp x5, x6, [P0] __LF \ + extr x14, x1, x24, #9 __LF \ + adcs x5, x5, x14 __LF \ + extr x14, x0, x1, #9 __LF \ + adcs x6, x6, x14 __LF \ + ldp x7, x8, [P0+16] __LF \ + extr x14, x15, x0, #9 __LF \ + adcs x7, x7, x14 __LF \ + extr x14, x16, x15, #9 __LF \ + adcs x8, x8, x14 __LF \ + ldp x9, x10, [P0+32] __LF \ + extr x14, x17, x16, #9 __LF \ + adcs x9, x9, x14 __LF \ + extr x14, x19, x17, #9 __LF \ + adcs x10, x10, x14 __LF \ + ldp x11, x12, [P0+48] __LF \ + extr x14, x20, x19, #9 __LF \ + adcs x11, x11, x14 __LF \ + extr x14, x21, x20, #9 __LF \ + adcs x12, x12, x14 __LF \ + orr x13, x24, #0xfffffffffffffe00 __LF \ + lsr x14, x21, #9 __LF \ + adcs x13, x13, x14 __LF \ + sbcs x5, x5, xzr __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbc x13, x13, xzr __LF \ + and x13, x13, #0x1ff __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] __LF \ + stp x11, x12, [P0+48] __LF \ + str x13, [P0+64] + +// Corresponds exactly to bignum_sqr_p521_alt + +#define sqr_p521(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x11, x2, x3 __LF \ + umulh x12, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x10, x2, x4 __LF \ + umulh x13, x2, x4 __LF \ + adds x12, x12, x10 __LF \ + ldp x6, x7, [P1+32] __LF \ + mul x10, x2, x5 __LF \ + umulh x14, x2, x5 __LF \ + adcs x13, x13, x10 __LF \ + ldp x8, x9, [P1+48] __LF \ + mul x10, x2, x6 __LF \ + umulh x15, x2, x6 __LF \ + adcs x14, x14, x10 __LF \ + mul x10, x2, x7 __LF \ + umulh x16, x2, x7 __LF \ + adcs x15, x15, x10 __LF \ + mul x10, x2, x8 __LF \ + umulh x17, x2, x8 __LF \ + adcs x16, x16, x10 __LF \ + mul x10, x2, x9 __LF \ + umulh x19, x2, x9 __LF \ + adcs x17, x17, x10 __LF \ + adc x19, x19, xzr __LF \ + mul x10, x3, x4 __LF \ + adds x13, x13, x10 __LF \ + mul x10, x3, x5 __LF \ + adcs x14, x14, x10 __LF \ + mul x10, x3, x6 __LF \ + adcs x15, x15, x10 __LF \ + mul x10, x3, x7 __LF \ + adcs x16, x16, x10 __LF \ + mul x10, x3, x8 __LF \ + adcs x17, x17, x10 __LF \ + mul x10, x3, x9 __LF \ + adcs x19, x19, x10 __LF \ + cset x20, hs __LF \ + umulh x10, x3, x4 __LF \ + adds x14, x14, x10 __LF \ + umulh x10, x3, x5 __LF \ + adcs x15, x15, x10 __LF \ + umulh x10, x3, x6 __LF \ + adcs x16, x16, x10 __LF \ 
+ umulh x10, x3, x7 __LF \ + adcs x17, x17, x10 __LF \ + umulh x10, x3, x8 __LF \ + adcs x19, x19, x10 __LF \ + umulh x10, x3, x9 __LF \ + adc x20, x20, x10 __LF \ + mul x10, x6, x7 __LF \ + umulh x21, x6, x7 __LF \ + adds x20, x20, x10 __LF \ + adc x21, x21, xzr __LF \ + mul x10, x4, x5 __LF \ + adds x15, x15, x10 __LF \ + mul x10, x4, x6 __LF \ + adcs x16, x16, x10 __LF \ + mul x10, x4, x7 __LF \ + adcs x17, x17, x10 __LF \ + mul x10, x4, x8 __LF \ + adcs x19, x19, x10 __LF \ + mul x10, x4, x9 __LF \ + adcs x20, x20, x10 __LF \ + mul x10, x6, x8 __LF \ + adcs x21, x21, x10 __LF \ + cset x22, hs __LF \ + umulh x10, x4, x5 __LF \ + adds x16, x16, x10 __LF \ + umulh x10, x4, x6 __LF \ + adcs x17, x17, x10 __LF \ + umulh x10, x4, x7 __LF \ + adcs x19, x19, x10 __LF \ + umulh x10, x4, x8 __LF \ + adcs x20, x20, x10 __LF \ + umulh x10, x4, x9 __LF \ + adcs x21, x21, x10 __LF \ + umulh x10, x6, x8 __LF \ + adc x22, x22, x10 __LF \ + mul x10, x7, x8 __LF \ + umulh x23, x7, x8 __LF \ + adds x22, x22, x10 __LF \ + adc x23, x23, xzr __LF \ + mul x10, x5, x6 __LF \ + adds x17, x17, x10 __LF \ + mul x10, x5, x7 __LF \ + adcs x19, x19, x10 __LF \ + mul x10, x5, x8 __LF \ + adcs x20, x20, x10 __LF \ + mul x10, x5, x9 __LF \ + adcs x21, x21, x10 __LF \ + mul x10, x6, x9 __LF \ + adcs x22, x22, x10 __LF \ + mul x10, x7, x9 __LF \ + adcs x23, x23, x10 __LF \ + cset x24, hs __LF \ + umulh x10, x5, x6 __LF \ + adds x19, x19, x10 __LF \ + umulh x10, x5, x7 __LF \ + adcs x20, x20, x10 __LF \ + umulh x10, x5, x8 __LF \ + adcs x21, x21, x10 __LF \ + umulh x10, x5, x9 __LF \ + adcs x22, x22, x10 __LF \ + umulh x10, x6, x9 __LF \ + adcs x23, x23, x10 __LF \ + umulh x10, x7, x9 __LF \ + adc x24, x24, x10 __LF \ + mul x10, x8, x9 __LF \ + umulh x25, x8, x9 __LF \ + adds x24, x24, x10 __LF \ + adc x25, x25, xzr __LF \ + adds x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + adcs x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adcs x17, x17, x17 __LF \ + adcs x19, x19, x19 __LF \ + adcs x20, x20, x20 __LF \ + adcs x21, x21, x21 __LF \ + adcs x22, x22, x22 __LF \ + adcs x23, x23, x23 __LF \ + adcs x24, x24, x24 __LF \ + adcs x25, x25, x25 __LF \ + cset x0, hs __LF \ + umulh x10, x2, x2 __LF \ + adds x11, x11, x10 __LF \ + mul x10, x3, x3 __LF \ + adcs x12, x12, x10 __LF \ + umulh x10, x3, x3 __LF \ + adcs x13, x13, x10 __LF \ + mul x10, x4, x4 __LF \ + adcs x14, x14, x10 __LF \ + umulh x10, x4, x4 __LF \ + adcs x15, x15, x10 __LF \ + mul x10, x5, x5 __LF \ + adcs x16, x16, x10 __LF \ + umulh x10, x5, x5 __LF \ + adcs x17, x17, x10 __LF \ + mul x10, x6, x6 __LF \ + adcs x19, x19, x10 __LF \ + umulh x10, x6, x6 __LF \ + adcs x20, x20, x10 __LF \ + mul x10, x7, x7 __LF \ + adcs x21, x21, x10 __LF \ + umulh x10, x7, x7 __LF \ + adcs x22, x22, x10 __LF \ + mul x10, x8, x8 __LF \ + adcs x23, x23, x10 __LF \ + umulh x10, x8, x8 __LF \ + adcs x24, x24, x10 __LF \ + mul x10, x9, x9 __LF \ + adcs x25, x25, x10 __LF \ + umulh x10, x9, x9 __LF \ + adc x0, x0, x10 __LF \ + ldr x1, [P1+64] __LF \ + add x1, x1, x1 __LF \ + mul x10, x1, x2 __LF \ + adds x19, x19, x10 __LF \ + umulh x10, x1, x2 __LF \ + adcs x20, x20, x10 __LF \ + mul x10, x1, x4 __LF \ + adcs x21, x21, x10 __LF \ + umulh x10, x1, x4 __LF \ + adcs x22, x22, x10 __LF \ + mul x10, x1, x6 __LF \ + adcs x23, x23, x10 __LF \ + umulh x10, x1, x6 __LF \ + adcs x24, x24, x10 __LF \ + mul x10, x1, x8 __LF \ + adcs x25, x25, x10 __LF \ + umulh x10, x1, x8 __LF \ + adcs x0, x0, x10 __LF \ + lsr x4, x1, #1 __LF \ + mul 
x4, x4, x4 __LF \ + adc x4, x4, xzr __LF \ + mul x10, x1, x3 __LF \ + adds x20, x20, x10 __LF \ + umulh x10, x1, x3 __LF \ + adcs x21, x21, x10 __LF \ + mul x10, x1, x5 __LF \ + adcs x22, x22, x10 __LF \ + umulh x10, x1, x5 __LF \ + adcs x23, x23, x10 __LF \ + mul x10, x1, x7 __LF \ + adcs x24, x24, x10 __LF \ + umulh x10, x1, x7 __LF \ + adcs x25, x25, x10 __LF \ + mul x10, x1, x9 __LF \ + adcs x0, x0, x10 __LF \ + umulh x10, x1, x9 __LF \ + adc x4, x4, x10 __LF \ + mul x2, x2, x2 __LF \ + cmp xzr, xzr __LF \ + extr x10, x20, x19, #9 __LF \ + adcs x2, x2, x10 __LF \ + extr x10, x21, x20, #9 __LF \ + adcs x11, x11, x10 __LF \ + extr x10, x22, x21, #9 __LF \ + adcs x12, x12, x10 __LF \ + extr x10, x23, x22, #9 __LF \ + adcs x13, x13, x10 __LF \ + extr x10, x24, x23, #9 __LF \ + adcs x14, x14, x10 __LF \ + extr x10, x25, x24, #9 __LF \ + adcs x15, x15, x10 __LF \ + extr x10, x0, x25, #9 __LF \ + adcs x16, x16, x10 __LF \ + extr x10, x4, x0, #9 __LF \ + adcs x17, x17, x10 __LF \ + orr x19, x19, #0xfffffffffffffe00 __LF \ + lsr x10, x4, #9 __LF \ + adcs x19, x19, x10 __LF \ + sbcs x2, x2, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbcs x15, x15, xzr __LF \ + sbcs x16, x16, xzr __LF \ + sbcs x17, x17, xzr __LF \ + sbc x19, x19, xzr __LF \ + and x19, x19, #0x1ff __LF \ + stp x2, x11, [P0] __LF \ + stp x12, x13, [P0+16] __LF \ + stp x14, x15, [P0+32] __LF \ + stp x16, x17, [P0+48] __LF \ + str x19, [P0+64] + +// Corresponds exactly to bignum_sub_p521 + +#define sub_p521(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + ldp x9, x10, [P1+32] __LF \ + ldp x4, x3, [P2+32] __LF \ + sbcs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + ldp x11, x12, [P1+48] __LF \ + ldp x4, x3, [P2+48] __LF \ + sbcs x11, x11, x4 __LF \ + sbcs x12, x12, x3 __LF \ + ldr x13, [P1+64] __LF \ + ldr x4, [P2+64] __LF \ + sbcs x13, x13, x4 __LF \ + sbcs x5, x5, xzr __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + and x13, x13, #0x1ff __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] __LF \ + stp x11, x12, [P0+48] __LF \ + str x13, [P0+64] + +S2N_BN_SYMBOL(p521_jadd_alt): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x28, [sp, #-16]! 
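For orientation, the sub_p521 macro defined just above reduces modulo p_521 = 2^521 - 1 with a single correction pass: it subtracts the operands limbwise, subtracts the final borrow once more, and truncates the top limb to 9 bits, which is the same as adding p_521 back whenever the first subtraction underflows. The following is a minimal C sketch of that idea, not part of the patch; the function name, the nine-limb layout and the __int128 borrow handling are illustrative assumptions.

#include <stdint.h>

/* Illustrative sketch of subtraction mod p_521 = 2^521 - 1 on nine 64-bit
   limbs (least significant first), mirroring the borrow-then-mask pattern
   of bignum_sub_p521 above. Not production code. */
static void sub_p521_sketch(uint64_t z[9], const uint64_t x[9],
                            const uint64_t y[9]) {
  uint64_t borrow = 0;
  uint64_t t[9];
  /* First pass: t = x - y, remembering the final borrow. */
  for (int i = 0; i < 9; i++) {
    unsigned __int128 d = (unsigned __int128)x[i] - y[i] - borrow;
    t[i] = (uint64_t)d;
    borrow = (uint64_t)(d >> 64) & 1; /* 1 iff this limb borrowed */
  }
  /* Second pass: subtract the borrow once more and truncate to 521 bits.
     When x < y this adds p_521 = 2^521 - 1 back in, giving x - y + p_521. */
  for (int i = 0; i < 9; i++) {
    unsigned __int128 d = (unsigned __int128)t[i] - borrow;
    z[i] = (uint64_t)d;
    borrow = (uint64_t)(d >> 64) & 1;
  }
  z[8] &= 0x1ff; /* keep only the 9 significant bits of the top limb */
}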
+ sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations + + sqr_p521(z1sq,z_1) + sqr_p521(z2sq,z_2) + + mul_p521(y1a,z_2,y_1) + mul_p521(y2a,z_1,y_2) + + mul_p521(x2a,z1sq,x_2) + mul_p521(x1a,z2sq,x_1) + mul_p521(y2a,z1sq,y2a) + mul_p521(y1a,z2sq,y1a) + + sub_p521(xd,x2a,x1a) + sub_p521(yd,y2a,y1a) + + sqr_p521(zz,xd) + sqr_p521(ww,yd) + + mul_p521(zzx1,zz,x1a) + mul_p521(zzx2,zz,x2a) + + sub_p521(resx,ww,zzx1) + sub_p521(t1,zzx2,zzx1) + + mul_p521(xd,xd,z_1) + + sub_p521(resx,resx,zzx2) + + sub_p521(t2,zzx1,resx) + + mul_p521(t1,t1,y1a) + mul_p521(resz,xd,z_2) + mul_p521(t2,yd,t2) + + sub_p521(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) +// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 +// Multiplex the z outputs accordingly and re-store in resz + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + ldp x4, x5, [z_1+32] + ldp x6, x7, [z_1+48] + ldr x8, [z_1+64] + + orr x20, x0, x1 + orr x21, x2, x3 + orr x22, x4, x5 + orr x23, x6, x7 + orr x20, x20, x21 + orr x22, x22, x23 + orr x20, x20, x8 + orr x20, x20, x22 + cmp x20, xzr + cset x20, ne + + ldp x10, x11, [z_2] + ldp x12, x13, [z_2+16] + ldp x14, x15, [z_2+32] + ldp x16, x17, [z_2+48] + ldr x19, [z_2+64] + + orr x21, x10, x11 + orr x22, x12, x13 + orr x23, x14, x15 + orr x24, x16, x17 + orr x21, x21, x22 + orr x23, x23, x24 + orr x21, x21, x19 + orr x21, x21, x23 + + csel x0, x0, x10, ne + csel x1, x1, x11, ne + csel x2, x2, x12, ne + csel x3, x3, x13, ne + csel x4, x4, x14, ne + csel x5, x5, x15, ne + csel x6, x6, x16, ne + csel x7, x7, x17, ne + csel x8, x8, x19, ne + + cmp x21, xzr + cset x21, ne + + cmp x21, x20 + + ldp x10, x11, [resz] + ldp x12, x13, [resz+16] + ldp x14, x15, [resz+32] + ldp x16, x17, [resz+48] + ldr x19, [resz+64] + + csel x0, x0, x10, ne + csel x1, x1, x11, ne + csel x2, x2, x12, ne + csel x3, x3, x13, ne + csel x4, x4, x14, ne + csel x5, x5, x15, ne + csel x6, x6, x16, ne + csel x7, x7, x17, ne + csel x8, x8, x19, ne + + stp x0, x1, [resz] + stp x2, x3, [resz+16] + stp x4, x5, [resz+32] + stp x6, x7, [resz+48] + str x8, [resz+64] + +// Multiplex the x and y outputs too, keeping the results in registers + + ldp x20, x21, [x_1] + ldp x0, x1, [resx] + csel x0, x20, x0, lo + csel x1, x21, x1, lo + ldp x20, x21, [x_2] + csel x0, x20, x0, hi + csel x1, x21, x1, hi + + ldp x20, x21, [x_1+16] + ldp x2, x3, [resx+16] + csel x2, x20, x2, lo + csel x3, x21, x3, lo + ldp x20, x21, [x_2+16] + csel x2, x20, x2, hi + csel x3, x21, x3, hi + + ldp x20, x21, [x_1+32] + ldp x4, x5, [resx+32] + csel x4, x20, x4, lo + csel x5, x21, x5, lo + ldp x20, x21, [x_2+32] + csel x4, x20, x4, hi + csel x5, x21, x5, hi + + ldp x20, x21, [x_1+48] + ldp x6, x7, [resx+48] + csel x6, x20, x6, lo + csel x7, x21, x7, lo + ldp x20, x21, [x_2+48] + csel x6, x20, x6, hi + csel x7, x21, x7, hi + + ldr x20, [x_1+64] + ldr x8, [resx+64] + csel x8, x20, x8, lo + ldr x21, [x_2+64] + csel x8, x21, x8, hi + + + ldp x20, x21, [y_1] + ldp x10, x11, [resy] + csel x10, x20, x10, lo + csel x11, x21, x11, lo + ldp x20, x21, [y_2] + csel x10, x20, x10, hi + csel x11, x21, x11, hi + + ldp x20, x21, [y_1+16] + ldp x12, x13, [resy+16] + csel x12, x20, x12, lo + csel x13, x21, x13, lo + ldp x20, x21, [y_2+16] + csel x12, x20, x12, hi + csel x13, x21, x13, hi + + ldp x20, x21, [y_1+32] + ldp 
x14, x15, [resy+32] + csel x14, x20, x14, lo + csel x15, x21, x15, lo + ldp x20, x21, [y_2+32] + csel x14, x20, x14, hi + csel x15, x21, x15, hi + + ldp x20, x21, [y_1+48] + ldp x16, x17, [resy+48] + csel x16, x20, x16, lo + csel x17, x21, x17, lo + ldp x20, x21, [y_2+48] + csel x16, x20, x16, hi + csel x17, x21, x17, hi + + ldr x20, [y_1+64] + ldr x19, [resy+64] + csel x19, x20, x19, lo + ldr x21, [y_2+64] + csel x19, x21, x19, hi + +// Finally store back the multiplexed values + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [x_3+32] + stp x6, x7, [x_3+48] + str x8, [x_3+64] + + ldp x0, x1, [resz] + ldp x2, x3, [resz+16] + ldp x4, x5, [resz+32] + ldp x6, x7, [resz+48] + ldr x8, [resz+64] + + stp x10, x11, [y_3] + stp x12, x13, [y_3+16] + stp x14, x15, [y_3+32] + stp x16, x17, [y_3+48] + str x19, [y_3+64] + + stp x0, x1, [z_3] + stp x2, x3, [z_3+16] + stp x4, x5, [z_3+32] + stp x6, x7, [z_3+48] + str x8, [z_3+64] + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x27, x28, [sp], 16 + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p521/p521_jdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jdouble.S similarity index 68% rename from third_party/s2n-bignum/arm/p521/p521_jdouble.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jdouble.S index 73afe4ffbd5..aa441a27ca4 100644 --- a/third_party/s2n-bignum/arm/p521/p521_jdouble.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jdouble.S @@ -65,353 +65,353 @@ // Call local code very close to bignum_mul_p521 and bignum_sqr_p521. #define mul_p521(P0,P1,P2) \ - add x0, P0; \ - add x1, P1; \ - add x2, P2; \ - bl local_mul_p521 + add x0, P0 __LF \ + add x1, P1 __LF \ + add x2, P2 __LF \ + bl p521_jdouble_local_mul_p521 // Call local code equivalent to bignum_sqr_p521 #define sqr_p521(P0,P1) \ - add x0, P0; \ - add x1, P1; \ - bl local_sqr_p521 + add x0, P0 __LF \ + add x1, P1 __LF \ + bl p521_jdouble_local_sqr_p521 // Corresponds exactly to bignum_add_p521 #define add_p521(P0,P1,P2) \ - cmp xzr, xzr; \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - adcs x5, x5, x4; \ - adcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - adcs x7, x7, x4; \ - adcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - adcs x9, x9, x4; \ - adcs x10, x10, x3; \ - ldp x11, x12, [P1+48]; \ - ldp x4, x3, [P2+48]; \ - adcs x11, x11, x4; \ - adcs x12, x12, x3; \ - ldr x13, [P1+64]; \ - ldr x4, [P2+64]; \ - adc x13, x13, x4; \ - subs x4, x13, #512; \ - csetm x4, hs; \ - sbcs x5, x5, xzr; \ - and x4, x4, #0x200; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, x4; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ + cmp xzr, xzr __LF \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + adcs x5, x5, x4 __LF \ + adcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + adcs x7, x7, x4 __LF \ + adcs x8, x8, x3 __LF \ + ldp x9, x10, [P1+32] __LF \ + ldp x4, x3, [P2+32] __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x3 __LF \ + ldp x11, x12, [P1+48] __LF \ + ldp x4, x3, [P2+48] __LF \ + adcs x11, x11, x4 __LF \ + adcs x12, x12, x3 __LF \ + ldr x13, [P1+64] __LF \ + ldr x4, [P2+64] 
__LF \ + adc x13, x13, x4 __LF \ + subs x4, x13, #512 __LF \ + csetm x4, hs __LF \ + sbcs x5, x5, xzr __LF \ + and x4, x4, #0x200 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbc x13, x13, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] __LF \ + stp x11, x12, [P0+48] __LF \ str x13, [P0+64] // Corresponds exactly to bignum_sub_p521 #define sub_p521(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - ldp x11, x12, [P1+48]; \ - ldp x4, x3, [P2+48]; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ - ldr x13, [P1+64]; \ - ldr x4, [P2+64]; \ - sbcs x13, x13, x4; \ - sbcs x5, x5, xzr; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - and x13, x13, #0x1ff; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + ldp x9, x10, [P1+32] __LF \ + ldp x4, x3, [P2+32] __LF \ + sbcs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + ldp x11, x12, [P1+48] __LF \ + ldp x4, x3, [P2+48] __LF \ + sbcs x11, x11, x4 __LF \ + sbcs x12, x12, x3 __LF \ + ldr x13, [P1+64] __LF \ + ldr x4, [P2+64] __LF \ + sbcs x13, x13, x4 __LF \ + sbcs x5, x5, xzr __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + and x13, x13, #0x1ff __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] __LF \ + stp x11, x12, [P0+48] __LF \ str x13, [P0+64] // P0 = C * P1 - D * P2 == C * P1 + D * (p_521 - P2) #define cmsub_p521(P0,C,P1,D,P2) \ - ldp x6, x7, [P1]; \ - mov x1, #(C); \ - mul x3, x1, x6; \ - mul x4, x1, x7; \ - umulh x6, x1, x6; \ - adds x4, x4, x6; \ - umulh x7, x1, x7; \ - ldp x8, x9, [P1+16]; \ - mul x5, x1, x8; \ - mul x6, x1, x9; \ - umulh x8, x1, x8; \ - adcs x5, x5, x7; \ - umulh x9, x1, x9; \ - adcs x6, x6, x8; \ - ldp x10, x11, [P1+32]; \ - mul x7, x1, x10; \ - mul x8, x1, x11; \ - umulh x10, x1, x10; \ - adcs x7, x7, x9; \ - umulh x11, x1, x11; \ - adcs x8, x8, x10; \ - ldp x12, x13, [P1+48]; \ - mul x9, x1, x12; \ - mul x10, x1, x13; \ - umulh x12, x1, x12; \ - adcs x9, x9, x11; \ - umulh x13, x1, x13; \ - adcs x10, x10, x12; \ - ldr x14, [P1+64]; \ - mul x11, x1, x14; \ - adc x11, x11, x13; \ - mov x1, #(D); \ - ldp x20, x21, [P2]; \ - mvn x20, x20; \ - mul x0, x1, x20; \ - umulh x20, x1, x20; \ - adds x3, x3, x0; \ - mvn x21, x21; \ - mul x0, x1, x21; \ - umulh x21, x1, x21; \ - adcs x4, x4, x0; \ - ldp x22, x23, [P2+16]; \ - mvn x22, x22; \ - mul x0, x1, x22; \ - umulh x22, x1, x22; \ - adcs x5, x5, x0; \ - mvn x23, x23; \ - mul x0, x1, x23; \ - umulh x23, x1, x23; \ - adcs x6, x6, x0; \ - ldp x17, x19, [P2+32]; \ - mvn x17, x17; \ - mul x0, x1, x17; \ - umulh x17, x1, x17; \ - adcs x7, x7, x0; \ - mvn x19, x19; \ - 
mul x0, x1, x19; \ - umulh x19, x1, x19; \ - adcs x8, x8, x0; \ - ldp x2, x16, [P2+48]; \ - mvn x2, x2; \ - mul x0, x1, x2; \ - umulh x2, x1, x2; \ - adcs x9, x9, x0; \ - mvn x16, x16; \ - mul x0, x1, x16; \ - umulh x16, x1, x16; \ - adcs x10, x10, x0; \ - ldr x0, [P2+64]; \ - eor x0, x0, #0x1ff; \ - mul x0, x1, x0; \ - adc x11, x11, x0; \ - adds x4, x4, x20; \ - adcs x5, x5, x21; \ - and x15, x4, x5; \ - adcs x6, x6, x22; \ - and x15, x15, x6; \ - adcs x7, x7, x23; \ - and x15, x15, x7; \ - adcs x8, x8, x17; \ - and x15, x15, x8; \ - adcs x9, x9, x19; \ - and x15, x15, x9; \ - adcs x10, x10, x2; \ - and x15, x15, x10; \ - adc x11, x11, x16; \ - lsr x12, x11, #9; \ - orr x11, x11, #0xfffffffffffffe00; \ - cmp xzr, xzr; \ - adcs xzr, x3, x12; \ - adcs xzr, x15, xzr; \ - adcs xzr, x11, xzr; \ - adcs x3, x3, x12; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adcs x6, x6, xzr; \ - adcs x7, x7, xzr; \ - adcs x8, x8, xzr; \ - adcs x9, x9, xzr; \ - adcs x10, x10, xzr; \ - adc x11, x11, xzr; \ - and x11, x11, #0x1ff; \ - stp x3, x4, [P0]; \ - stp x5, x6, [P0+16]; \ - stp x7, x8, [P0+32]; \ - stp x9, x10, [P0+48]; \ + ldp x6, x7, [P1] __LF \ + mov x1, #(C) __LF \ + mul x3, x1, x6 __LF \ + mul x4, x1, x7 __LF \ + umulh x6, x1, x6 __LF \ + adds x4, x4, x6 __LF \ + umulh x7, x1, x7 __LF \ + ldp x8, x9, [P1+16] __LF \ + mul x5, x1, x8 __LF \ + mul x6, x1, x9 __LF \ + umulh x8, x1, x8 __LF \ + adcs x5, x5, x7 __LF \ + umulh x9, x1, x9 __LF \ + adcs x6, x6, x8 __LF \ + ldp x10, x11, [P1+32] __LF \ + mul x7, x1, x10 __LF \ + mul x8, x1, x11 __LF \ + umulh x10, x1, x10 __LF \ + adcs x7, x7, x9 __LF \ + umulh x11, x1, x11 __LF \ + adcs x8, x8, x10 __LF \ + ldp x12, x13, [P1+48] __LF \ + mul x9, x1, x12 __LF \ + mul x10, x1, x13 __LF \ + umulh x12, x1, x12 __LF \ + adcs x9, x9, x11 __LF \ + umulh x13, x1, x13 __LF \ + adcs x10, x10, x12 __LF \ + ldr x14, [P1+64] __LF \ + mul x11, x1, x14 __LF \ + adc x11, x11, x13 __LF \ + mov x1, #(D) __LF \ + ldp x20, x21, [P2] __LF \ + mvn x20, x20 __LF \ + mul x0, x1, x20 __LF \ + umulh x20, x1, x20 __LF \ + adds x3, x3, x0 __LF \ + mvn x21, x21 __LF \ + mul x0, x1, x21 __LF \ + umulh x21, x1, x21 __LF \ + adcs x4, x4, x0 __LF \ + ldp x22, x23, [P2+16] __LF \ + mvn x22, x22 __LF \ + mul x0, x1, x22 __LF \ + umulh x22, x1, x22 __LF \ + adcs x5, x5, x0 __LF \ + mvn x23, x23 __LF \ + mul x0, x1, x23 __LF \ + umulh x23, x1, x23 __LF \ + adcs x6, x6, x0 __LF \ + ldp x17, x19, [P2+32] __LF \ + mvn x17, x17 __LF \ + mul x0, x1, x17 __LF \ + umulh x17, x1, x17 __LF \ + adcs x7, x7, x0 __LF \ + mvn x19, x19 __LF \ + mul x0, x1, x19 __LF \ + umulh x19, x1, x19 __LF \ + adcs x8, x8, x0 __LF \ + ldp x2, x16, [P2+48] __LF \ + mvn x2, x2 __LF \ + mul x0, x1, x2 __LF \ + umulh x2, x1, x2 __LF \ + adcs x9, x9, x0 __LF \ + mvn x16, x16 __LF \ + mul x0, x1, x16 __LF \ + umulh x16, x1, x16 __LF \ + adcs x10, x10, x0 __LF \ + ldr x0, [P2+64] __LF \ + eor x0, x0, #0x1ff __LF \ + mul x0, x1, x0 __LF \ + adc x11, x11, x0 __LF \ + adds x4, x4, x20 __LF \ + adcs x5, x5, x21 __LF \ + and x15, x4, x5 __LF \ + adcs x6, x6, x22 __LF \ + and x15, x15, x6 __LF \ + adcs x7, x7, x23 __LF \ + and x15, x15, x7 __LF \ + adcs x8, x8, x17 __LF \ + and x15, x15, x8 __LF \ + adcs x9, x9, x19 __LF \ + and x15, x15, x9 __LF \ + adcs x10, x10, x2 __LF \ + and x15, x15, x10 __LF \ + adc x11, x11, x16 __LF \ + lsr x12, x11, #9 __LF \ + orr x11, x11, #0xfffffffffffffe00 __LF \ + cmp xzr, xzr __LF \ + adcs xzr, x3, x12 __LF \ + adcs xzr, x15, xzr __LF \ + adcs xzr, x11, xzr __LF \ + adcs x3, x3, x12 __LF \ + adcs 
x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adcs x6, x6, xzr __LF \ + adcs x7, x7, xzr __LF \ + adcs x8, x8, xzr __LF \ + adcs x9, x9, xzr __LF \ + adcs x10, x10, xzr __LF \ + adc x11, x11, xzr __LF \ + and x11, x11, #0x1ff __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] __LF \ + stp x7, x8, [P0+32] __LF \ + stp x9, x10, [P0+48] __LF \ str x11, [P0+64] // P0 = 3 * P1 - 8 * P2 == 3 * P1 + 8 * (p_521 - P2) #define cmsub38_p521(P0,P1,P2) \ - ldp x6, x7, [P1]; \ - lsl x3, x6, #1; \ - adds x3, x3, x6; \ - extr x4, x7, x6, #63; \ - adcs x4, x4, x7; \ - ldp x8, x9, [P1+16]; \ - extr x5, x8, x7, #63; \ - adcs x5, x5, x8; \ - extr x6, x9, x8, #63; \ - adcs x6, x6, x9; \ - ldp x10, x11, [P1+32]; \ - extr x7, x10, x9, #63; \ - adcs x7, x7, x10; \ - extr x8, x11, x10, #63; \ - adcs x8, x8, x11; \ - ldp x12, x13, [P1+48]; \ - extr x9, x12, x11, #63; \ - adcs x9, x9, x12; \ - extr x10, x13, x12, #63; \ - adcs x10, x10, x13; \ - ldr x14, [P1+64]; \ - extr x11, x14, x13, #63; \ - adc x11, x11, x14; \ - ldp x20, x21, [P2]; \ - mvn x20, x20; \ - lsl x0, x20, #3; \ - adds x3, x3, x0; \ - mvn x21, x21; \ - extr x0, x21, x20, #61; \ - adcs x4, x4, x0; \ - ldp x22, x23, [P2+16]; \ - mvn x22, x22; \ - extr x0, x22, x21, #61; \ - adcs x5, x5, x0; \ - and x15, x4, x5; \ - mvn x23, x23; \ - extr x0, x23, x22, #61; \ - adcs x6, x6, x0; \ - and x15, x15, x6; \ - ldp x20, x21, [P2+32]; \ - mvn x20, x20; \ - extr x0, x20, x23, #61; \ - adcs x7, x7, x0; \ - and x15, x15, x7; \ - mvn x21, x21; \ - extr x0, x21, x20, #61; \ - adcs x8, x8, x0; \ - and x15, x15, x8; \ - ldp x22, x23, [P2+48]; \ - mvn x22, x22; \ - extr x0, x22, x21, #61; \ - adcs x9, x9, x0; \ - and x15, x15, x9; \ - mvn x23, x23; \ - extr x0, x23, x22, #61; \ - adcs x10, x10, x0; \ - and x15, x15, x10; \ - ldr x0, [P2+64]; \ - eor x0, x0, #0x1ff; \ - extr x0, x0, x23, #61; \ - adc x11, x11, x0; \ - lsr x12, x11, #9; \ - orr x11, x11, #0xfffffffffffffe00; \ - cmp xzr, xzr; \ - adcs xzr, x3, x12; \ - adcs xzr, x15, xzr; \ - adcs xzr, x11, xzr; \ - adcs x3, x3, x12; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adcs x6, x6, xzr; \ - adcs x7, x7, xzr; \ - adcs x8, x8, xzr; \ - adcs x9, x9, xzr; \ - adcs x10, x10, xzr; \ - adc x11, x11, xzr; \ - and x11, x11, #0x1ff; \ - stp x3, x4, [P0]; \ - stp x5, x6, [P0+16]; \ - stp x7, x8, [P0+32]; \ - stp x9, x10, [P0+48]; \ + ldp x6, x7, [P1] __LF \ + lsl x3, x6, #1 __LF \ + adds x3, x3, x6 __LF \ + extr x4, x7, x6, #63 __LF \ + adcs x4, x4, x7 __LF \ + ldp x8, x9, [P1+16] __LF \ + extr x5, x8, x7, #63 __LF \ + adcs x5, x5, x8 __LF \ + extr x6, x9, x8, #63 __LF \ + adcs x6, x6, x9 __LF \ + ldp x10, x11, [P1+32] __LF \ + extr x7, x10, x9, #63 __LF \ + adcs x7, x7, x10 __LF \ + extr x8, x11, x10, #63 __LF \ + adcs x8, x8, x11 __LF \ + ldp x12, x13, [P1+48] __LF \ + extr x9, x12, x11, #63 __LF \ + adcs x9, x9, x12 __LF \ + extr x10, x13, x12, #63 __LF \ + adcs x10, x10, x13 __LF \ + ldr x14, [P1+64] __LF \ + extr x11, x14, x13, #63 __LF \ + adc x11, x11, x14 __LF \ + ldp x20, x21, [P2] __LF \ + mvn x20, x20 __LF \ + lsl x0, x20, #3 __LF \ + adds x3, x3, x0 __LF \ + mvn x21, x21 __LF \ + extr x0, x21, x20, #61 __LF \ + adcs x4, x4, x0 __LF \ + ldp x22, x23, [P2+16] __LF \ + mvn x22, x22 __LF \ + extr x0, x22, x21, #61 __LF \ + adcs x5, x5, x0 __LF \ + and x15, x4, x5 __LF \ + mvn x23, x23 __LF \ + extr x0, x23, x22, #61 __LF \ + adcs x6, x6, x0 __LF \ + and x15, x15, x6 __LF \ + ldp x20, x21, [P2+32] __LF \ + mvn x20, x20 __LF \ + extr x0, x20, x23, #61 __LF \ + adcs x7, x7, x0 __LF \ + and x15, x15, x7 __LF \ + 
mvn x21, x21 __LF \ + extr x0, x21, x20, #61 __LF \ + adcs x8, x8, x0 __LF \ + and x15, x15, x8 __LF \ + ldp x22, x23, [P2+48] __LF \ + mvn x22, x22 __LF \ + extr x0, x22, x21, #61 __LF \ + adcs x9, x9, x0 __LF \ + and x15, x15, x9 __LF \ + mvn x23, x23 __LF \ + extr x0, x23, x22, #61 __LF \ + adcs x10, x10, x0 __LF \ + and x15, x15, x10 __LF \ + ldr x0, [P2+64] __LF \ + eor x0, x0, #0x1ff __LF \ + extr x0, x0, x23, #61 __LF \ + adc x11, x11, x0 __LF \ + lsr x12, x11, #9 __LF \ + orr x11, x11, #0xfffffffffffffe00 __LF \ + cmp xzr, xzr __LF \ + adcs xzr, x3, x12 __LF \ + adcs xzr, x15, xzr __LF \ + adcs xzr, x11, xzr __LF \ + adcs x3, x3, x12 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adcs x6, x6, xzr __LF \ + adcs x7, x7, xzr __LF \ + adcs x8, x8, xzr __LF \ + adcs x9, x9, xzr __LF \ + adcs x10, x10, xzr __LF \ + adc x11, x11, xzr __LF \ + and x11, x11, #0x1ff __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] __LF \ + stp x7, x8, [P0+32] __LF \ + stp x9, x10, [P0+48] __LF \ str x11, [P0+64] // P0 = 4 * P1 - P2 = 4 * P1 + (p_521 - P2) #define cmsub41_p521(P0,P1,P2) \ - ldp x6, x7, [P1]; \ - lsl x3, x6, #2; \ - extr x4, x7, x6, #62; \ - ldp x8, x9, [P1+16]; \ - extr x5, x8, x7, #62; \ - extr x6, x9, x8, #62; \ - ldp x10, x11, [P1+32]; \ - extr x7, x10, x9, #62; \ - extr x8, x11, x10, #62; \ - ldp x12, x13, [P1+48]; \ - extr x9, x12, x11, #62; \ - extr x10, x13, x12, #62; \ - ldr x14, [P1+64]; \ - extr x11, x14, x13, #62; \ - ldp x0, x1, [P2]; \ - mvn x0, x0; \ - adds x3, x3, x0; \ - sbcs x4, x4, x1; \ - ldp x0, x1, [P2+16]; \ - sbcs x5, x5, x0; \ - and x15, x4, x5; \ - sbcs x6, x6, x1; \ - and x15, x15, x6; \ - ldp x0, x1, [P2+32]; \ - sbcs x7, x7, x0; \ - and x15, x15, x7; \ - sbcs x8, x8, x1; \ - and x15, x15, x8; \ - ldp x0, x1, [P2+48]; \ - sbcs x9, x9, x0; \ - and x15, x15, x9; \ - sbcs x10, x10, x1; \ - and x15, x15, x10; \ - ldr x0, [P2+64]; \ - eor x0, x0, #0x1ff; \ - adc x11, x11, x0; \ - lsr x12, x11, #9; \ - orr x11, x11, #0xfffffffffffffe00; \ - cmp xzr, xzr; \ - adcs xzr, x3, x12; \ - adcs xzr, x15, xzr; \ - adcs xzr, x11, xzr; \ - adcs x3, x3, x12; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adcs x6, x6, xzr; \ - adcs x7, x7, xzr; \ - adcs x8, x8, xzr; \ - adcs x9, x9, xzr; \ - adcs x10, x10, xzr; \ - adc x11, x11, xzr; \ - and x11, x11, #0x1ff; \ - stp x3, x4, [P0]; \ - stp x5, x6, [P0+16]; \ - stp x7, x8, [P0+32]; \ - stp x9, x10, [P0+48]; \ + ldp x6, x7, [P1] __LF \ + lsl x3, x6, #2 __LF \ + extr x4, x7, x6, #62 __LF \ + ldp x8, x9, [P1+16] __LF \ + extr x5, x8, x7, #62 __LF \ + extr x6, x9, x8, #62 __LF \ + ldp x10, x11, [P1+32] __LF \ + extr x7, x10, x9, #62 __LF \ + extr x8, x11, x10, #62 __LF \ + ldp x12, x13, [P1+48] __LF \ + extr x9, x12, x11, #62 __LF \ + extr x10, x13, x12, #62 __LF \ + ldr x14, [P1+64] __LF \ + extr x11, x14, x13, #62 __LF \ + ldp x0, x1, [P2] __LF \ + mvn x0, x0 __LF \ + adds x3, x3, x0 __LF \ + sbcs x4, x4, x1 __LF \ + ldp x0, x1, [P2+16] __LF \ + sbcs x5, x5, x0 __LF \ + and x15, x4, x5 __LF \ + sbcs x6, x6, x1 __LF \ + and x15, x15, x6 __LF \ + ldp x0, x1, [P2+32] __LF \ + sbcs x7, x7, x0 __LF \ + and x15, x15, x7 __LF \ + sbcs x8, x8, x1 __LF \ + and x15, x15, x8 __LF \ + ldp x0, x1, [P2+48] __LF \ + sbcs x9, x9, x0 __LF \ + and x15, x15, x9 __LF \ + sbcs x10, x10, x1 __LF \ + and x15, x15, x10 __LF \ + ldr x0, [P2+64] __LF \ + eor x0, x0, #0x1ff __LF \ + adc x11, x11, x0 __LF \ + lsr x12, x11, #9 __LF \ + orr x11, x11, #0xfffffffffffffe00 __LF \ + cmp xzr, xzr __LF \ + adcs xzr, x3, x12 __LF \ + adcs xzr, x15, xzr 
__LF \ + adcs xzr, x11, xzr __LF \ + adcs x3, x3, x12 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adcs x6, x6, xzr __LF \ + adcs x7, x7, xzr __LF \ + adcs x8, x8, xzr __LF \ + adcs x9, x9, xzr __LF \ + adcs x10, x10, xzr __LF \ + adc x11, x11, xzr __LF \ + and x11, x11, #0x1ff __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] __LF \ + stp x7, x8, [P0+32] __LF \ + stp x9, x10, [P0+48] __LF \ str x11, [P0+64] S2N_BN_SYMBOL(p521_jdouble): @@ -494,9 +494,9 @@ S2N_BN_SYMBOL(p521_jdouble): ret // Local versions of the two "big" field operations, identical to -// bignum_mul_p521_neon and bignum_sqr_p521_neon. +// bignum_mul_p521 and bignum_sqr_p521. -local_mul_p521: +p521_jdouble_local_mul_p521: stp x19, x20, [sp, #-16]! stp x21, x22, [sp, #-16]! stp x23, x24, [sp, #-16]! @@ -1173,7 +1173,7 @@ local_mul_p521: ldp x19, x20, [sp], #16 ret -local_sqr_p521: +p521_jdouble_local_sqr_p521: stp x19, x20, [sp, #-16]! stp x21, x22, [sp, #-16]! stp x23, x24, [sp, #-16]! diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jdouble_alt.S new file mode 100644 index 00000000000..691e62bd0eb --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jdouble_alt.S @@ -0,0 +1,1458 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on NIST curve P-521 in Jacobian coordinates +// +// extern void p521_jdouble_alt +// (uint64_t p3[static 27],uint64_t p1[static 27]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// It is assumed that all coordinates of the input point are fully +// reduced mod p_521 and that the z coordinate is not zero. 
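As a reference point for the field-operation sequence that follows, here is one textbook way to double a Jacobian triple on a curve with a = -3 (which P-521 is). These are the standard formulas only, not a claim about the exact intermediate values or operation order the code below uses, and the names delta, gamma, beta, alpha are illustrative:

  delta = Z1^2
  gamma = Y1^2
  beta  = X1 * gamma
  alpha = 3 * (X1 - delta) * (X1 + delta)
  X3 = alpha^2 - 8 * beta
  Z3 = (Y1 + Z1)^2 - gamma - delta      ( = 2 * Y1 * Z1 )
  Y3 = alpha * (4 * beta - X3) - 8 * gamma^2

All arithmetic is mod p_521, and the resulting triple (X3,Y3,Z3) represents the affine point 2 * (X1/Z1^2, Y1/Z1^3).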
+// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jdouble_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 72 + +// Stable homes for input arguments during main code sequence + +#define input_z x26 +#define input_x x27 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries + +#define z2 sp, #(NUMSIZE*0) +#define y2 sp, #(NUMSIZE*1) +#define x2p sp, #(NUMSIZE*2) +#define xy2 sp, #(NUMSIZE*3) + +#define y4 sp, #(NUMSIZE*4) +#define t2 sp, #(NUMSIZE*4) + +#define dx2 sp, #(NUMSIZE*5) +#define t1 sp, #(NUMSIZE*5) + +#define d sp, #(NUMSIZE*6) +#define x4p sp, #(NUMSIZE*6) + +// NUMSIZE*7 is not 16-aligned so we round it up + +#define NSPACE (NUMSIZE*7+8) + +// Corresponds exactly to bignum_mul_p521_alt + +#define mul_p521(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + mul x15, x3, x5 __LF \ + umulh x16, x3, x5 __LF \ + mul x14, x3, x6 __LF \ + umulh x17, x3, x6 __LF \ + adds x16, x16, x14 __LF \ + ldp x7, x8, [P2+16] __LF \ + mul x14, x3, x7 __LF \ + umulh x19, x3, x7 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x3, x8 __LF \ + umulh x20, x3, x8 __LF \ + adcs x19, x19, x14 __LF \ + ldp x9, x10, [P2+32] __LF \ + mul x14, x3, x9 __LF \ + umulh x21, x3, x9 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x3, x10 __LF \ + umulh x22, x3, x10 __LF \ + adcs x21, x21, x14 __LF \ + ldp x11, x12, [P2+48] __LF \ + mul x14, x3, x11 __LF \ + umulh x23, x3, x11 __LF \ + adcs x22, x22, x14 __LF \ + ldr x13, [P2+64] __LF \ + mul x14, x3, x12 __LF \ + umulh x24, x3, x12 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x13 __LF \ + umulh x1, x3, x13 __LF \ + adcs x24, x24, x14 __LF \ + adc x1, x1, xzr __LF \ + mul x14, x4, x5 __LF \ + adds x16, x16, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x1, x1, x14 __LF \ + cset x0, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x17, x17, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x21, x21, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x0, x0, x14 __LF \ + stp x15, x16, [P0] __LF \ + ldp x3, x4, [P1+16] __LF \ + mul x14, x3, x5 __LF \ + adds x17, x17, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x1, 
x1, x14 __LF \ + mul x14, x3, x13 __LF \ + adcs x0, x0, x14 __LF \ + cset x15, hs __LF \ + umulh x14, x3, x5 __LF \ + adds x19, x19, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x21, x21, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x3, x12 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x13 __LF \ + adc x15, x15, x14 __LF \ + mul x14, x4, x5 __LF \ + adds x19, x19, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x15, x15, x14 __LF \ + cset x16, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x20, x20, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x21, x21, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x16, x16, x14 __LF \ + stp x17, x19, [P0+16] __LF \ + ldp x3, x4, [P1+32] __LF \ + mul x14, x3, x5 __LF \ + adds x20, x20, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x3, x13 __LF \ + adcs x16, x16, x14 __LF \ + cset x17, hs __LF \ + umulh x14, x3, x5 __LF \ + adds x21, x21, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x3, x12 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x3, x13 __LF \ + adc x17, x17, x14 __LF \ + mul x14, x4, x5 __LF \ + adds x21, x21, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x17, x17, x14 __LF \ + cset x19, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x22, x22, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x17, 
x17, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x19, x19, x14 __LF \ + stp x20, x21, [P0+32] __LF \ + ldp x3, x4, [P1+48] __LF \ + mul x14, x3, x5 __LF \ + adds x22, x22, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x3, x13 __LF \ + adcs x19, x19, x14 __LF \ + cset x20, hs __LF \ + umulh x14, x3, x5 __LF \ + adds x23, x23, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x3, x12 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x3, x13 __LF \ + adc x20, x20, x14 __LF \ + mul x14, x4, x5 __LF \ + adds x23, x23, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x20, x20, x14 __LF \ + cset x21, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x24, x24, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x21, x21, x14 __LF \ + stp x22, x23, [P0+48] __LF \ + ldr x3, [P1+64] __LF \ + mul x14, x3, x5 __LF \ + adds x24, x24, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x3, x13 __LF \ + adc x21, x21, x14 __LF \ + umulh x14, x3, x5 __LF \ + adds x1, x1, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x3, x12 __LF \ + adc x21, x21, x14 __LF \ + cmp xzr, xzr __LF \ + ldp x5, x6, [P0] __LF \ + extr x14, x1, x24, #9 __LF \ + adcs x5, x5, x14 __LF \ + extr x14, x0, x1, #9 __LF \ + adcs x6, x6, x14 __LF \ + ldp x7, x8, [P0+16] __LF \ + extr x14, x15, x0, #9 __LF \ + adcs x7, x7, x14 __LF \ + extr x14, x16, x15, #9 __LF \ + adcs x8, x8, x14 __LF \ + ldp x9, x10, [P0+32] __LF \ + extr x14, x17, x16, #9 __LF \ + adcs x9, x9, x14 __LF \ + extr x14, x19, x17, #9 __LF \ + adcs x10, x10, x14 __LF \ + ldp x11, 
x12, [P0+48] __LF \ + extr x14, x20, x19, #9 __LF \ + adcs x11, x11, x14 __LF \ + extr x14, x21, x20, #9 __LF \ + adcs x12, x12, x14 __LF \ + orr x13, x24, #0xfffffffffffffe00 __LF \ + lsr x14, x21, #9 __LF \ + adcs x13, x13, x14 __LF \ + sbcs x5, x5, xzr __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbc x13, x13, xzr __LF \ + and x13, x13, #0x1ff __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] __LF \ + stp x11, x12, [P0+48] __LF \ + str x13, [P0+64] + +// Corresponds exactly to bignum_sqr_p521_alt + +#define sqr_p521(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x11, x2, x3 __LF \ + umulh x12, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x10, x2, x4 __LF \ + umulh x13, x2, x4 __LF \ + adds x12, x12, x10 __LF \ + ldp x6, x7, [P1+32] __LF \ + mul x10, x2, x5 __LF \ + umulh x14, x2, x5 __LF \ + adcs x13, x13, x10 __LF \ + ldp x8, x9, [P1+48] __LF \ + mul x10, x2, x6 __LF \ + umulh x15, x2, x6 __LF \ + adcs x14, x14, x10 __LF \ + mul x10, x2, x7 __LF \ + umulh x16, x2, x7 __LF \ + adcs x15, x15, x10 __LF \ + mul x10, x2, x8 __LF \ + umulh x17, x2, x8 __LF \ + adcs x16, x16, x10 __LF \ + mul x10, x2, x9 __LF \ + umulh x19, x2, x9 __LF \ + adcs x17, x17, x10 __LF \ + adc x19, x19, xzr __LF \ + mul x10, x3, x4 __LF \ + adds x13, x13, x10 __LF \ + mul x10, x3, x5 __LF \ + adcs x14, x14, x10 __LF \ + mul x10, x3, x6 __LF \ + adcs x15, x15, x10 __LF \ + mul x10, x3, x7 __LF \ + adcs x16, x16, x10 __LF \ + mul x10, x3, x8 __LF \ + adcs x17, x17, x10 __LF \ + mul x10, x3, x9 __LF \ + adcs x19, x19, x10 __LF \ + cset x20, hs __LF \ + umulh x10, x3, x4 __LF \ + adds x14, x14, x10 __LF \ + umulh x10, x3, x5 __LF \ + adcs x15, x15, x10 __LF \ + umulh x10, x3, x6 __LF \ + adcs x16, x16, x10 __LF \ + umulh x10, x3, x7 __LF \ + adcs x17, x17, x10 __LF \ + umulh x10, x3, x8 __LF \ + adcs x19, x19, x10 __LF \ + umulh x10, x3, x9 __LF \ + adc x20, x20, x10 __LF \ + mul x10, x6, x7 __LF \ + umulh x21, x6, x7 __LF \ + adds x20, x20, x10 __LF \ + adc x21, x21, xzr __LF \ + mul x10, x4, x5 __LF \ + adds x15, x15, x10 __LF \ + mul x10, x4, x6 __LF \ + adcs x16, x16, x10 __LF \ + mul x10, x4, x7 __LF \ + adcs x17, x17, x10 __LF \ + mul x10, x4, x8 __LF \ + adcs x19, x19, x10 __LF \ + mul x10, x4, x9 __LF \ + adcs x20, x20, x10 __LF \ + mul x10, x6, x8 __LF \ + adcs x21, x21, x10 __LF \ + cset x22, hs __LF \ + umulh x10, x4, x5 __LF \ + adds x16, x16, x10 __LF \ + umulh x10, x4, x6 __LF \ + adcs x17, x17, x10 __LF \ + umulh x10, x4, x7 __LF \ + adcs x19, x19, x10 __LF \ + umulh x10, x4, x8 __LF \ + adcs x20, x20, x10 __LF \ + umulh x10, x4, x9 __LF \ + adcs x21, x21, x10 __LF \ + umulh x10, x6, x8 __LF \ + adc x22, x22, x10 __LF \ + mul x10, x7, x8 __LF \ + umulh x23, x7, x8 __LF \ + adds x22, x22, x10 __LF \ + adc x23, x23, xzr __LF \ + mul x10, x5, x6 __LF \ + adds x17, x17, x10 __LF \ + mul x10, x5, x7 __LF \ + adcs x19, x19, x10 __LF \ + mul x10, x5, x8 __LF \ + adcs x20, x20, x10 __LF \ + mul x10, x5, x9 __LF \ + adcs x21, x21, x10 __LF \ + mul x10, x6, x9 __LF \ + adcs x22, x22, x10 __LF \ + mul x10, x7, x9 __LF \ + adcs x23, x23, x10 __LF \ + cset x24, hs __LF \ + umulh x10, x5, x6 __LF \ + adds x19, x19, x10 __LF \ + umulh x10, x5, x7 __LF \ + adcs x20, x20, x10 __LF \ + umulh x10, x5, x8 __LF \ + adcs x21, x21, x10 __LF \ + umulh x10, x5, x9 __LF \ + adcs x22, x22, x10 __LF \ + umulh x10, x6, x9 __LF \ + adcs x23, x23, x10 
__LF \ + umulh x10, x7, x9 __LF \ + adc x24, x24, x10 __LF \ + mul x10, x8, x9 __LF \ + umulh x25, x8, x9 __LF \ + adds x24, x24, x10 __LF \ + adc x25, x25, xzr __LF \ + adds x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + adcs x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adcs x17, x17, x17 __LF \ + adcs x19, x19, x19 __LF \ + adcs x20, x20, x20 __LF \ + adcs x21, x21, x21 __LF \ + adcs x22, x22, x22 __LF \ + adcs x23, x23, x23 __LF \ + adcs x24, x24, x24 __LF \ + adcs x25, x25, x25 __LF \ + cset x0, hs __LF \ + umulh x10, x2, x2 __LF \ + adds x11, x11, x10 __LF \ + mul x10, x3, x3 __LF \ + adcs x12, x12, x10 __LF \ + umulh x10, x3, x3 __LF \ + adcs x13, x13, x10 __LF \ + mul x10, x4, x4 __LF \ + adcs x14, x14, x10 __LF \ + umulh x10, x4, x4 __LF \ + adcs x15, x15, x10 __LF \ + mul x10, x5, x5 __LF \ + adcs x16, x16, x10 __LF \ + umulh x10, x5, x5 __LF \ + adcs x17, x17, x10 __LF \ + mul x10, x6, x6 __LF \ + adcs x19, x19, x10 __LF \ + umulh x10, x6, x6 __LF \ + adcs x20, x20, x10 __LF \ + mul x10, x7, x7 __LF \ + adcs x21, x21, x10 __LF \ + umulh x10, x7, x7 __LF \ + adcs x22, x22, x10 __LF \ + mul x10, x8, x8 __LF \ + adcs x23, x23, x10 __LF \ + umulh x10, x8, x8 __LF \ + adcs x24, x24, x10 __LF \ + mul x10, x9, x9 __LF \ + adcs x25, x25, x10 __LF \ + umulh x10, x9, x9 __LF \ + adc x0, x0, x10 __LF \ + ldr x1, [P1+64] __LF \ + add x1, x1, x1 __LF \ + mul x10, x1, x2 __LF \ + adds x19, x19, x10 __LF \ + umulh x10, x1, x2 __LF \ + adcs x20, x20, x10 __LF \ + mul x10, x1, x4 __LF \ + adcs x21, x21, x10 __LF \ + umulh x10, x1, x4 __LF \ + adcs x22, x22, x10 __LF \ + mul x10, x1, x6 __LF \ + adcs x23, x23, x10 __LF \ + umulh x10, x1, x6 __LF \ + adcs x24, x24, x10 __LF \ + mul x10, x1, x8 __LF \ + adcs x25, x25, x10 __LF \ + umulh x10, x1, x8 __LF \ + adcs x0, x0, x10 __LF \ + lsr x4, x1, #1 __LF \ + mul x4, x4, x4 __LF \ + adc x4, x4, xzr __LF \ + mul x10, x1, x3 __LF \ + adds x20, x20, x10 __LF \ + umulh x10, x1, x3 __LF \ + adcs x21, x21, x10 __LF \ + mul x10, x1, x5 __LF \ + adcs x22, x22, x10 __LF \ + umulh x10, x1, x5 __LF \ + adcs x23, x23, x10 __LF \ + mul x10, x1, x7 __LF \ + adcs x24, x24, x10 __LF \ + umulh x10, x1, x7 __LF \ + adcs x25, x25, x10 __LF \ + mul x10, x1, x9 __LF \ + adcs x0, x0, x10 __LF \ + umulh x10, x1, x9 __LF \ + adc x4, x4, x10 __LF \ + mul x2, x2, x2 __LF \ + cmp xzr, xzr __LF \ + extr x10, x20, x19, #9 __LF \ + adcs x2, x2, x10 __LF \ + extr x10, x21, x20, #9 __LF \ + adcs x11, x11, x10 __LF \ + extr x10, x22, x21, #9 __LF \ + adcs x12, x12, x10 __LF \ + extr x10, x23, x22, #9 __LF \ + adcs x13, x13, x10 __LF \ + extr x10, x24, x23, #9 __LF \ + adcs x14, x14, x10 __LF \ + extr x10, x25, x24, #9 __LF \ + adcs x15, x15, x10 __LF \ + extr x10, x0, x25, #9 __LF \ + adcs x16, x16, x10 __LF \ + extr x10, x4, x0, #9 __LF \ + adcs x17, x17, x10 __LF \ + orr x19, x19, #0xfffffffffffffe00 __LF \ + lsr x10, x4, #9 __LF \ + adcs x19, x19, x10 __LF \ + sbcs x2, x2, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbcs x15, x15, xzr __LF \ + sbcs x16, x16, xzr __LF \ + sbcs x17, x17, xzr __LF \ + sbc x19, x19, xzr __LF \ + and x19, x19, #0x1ff __LF \ + stp x2, x11, [P0] __LF \ + stp x12, x13, [P0+16] __LF \ + stp x14, x15, [P0+32] __LF \ + stp x16, x17, [P0+48] __LF \ + str x19, [P0+64] + +// Corresponds exactly to bignum_add_p521 + +#define add_p521(P0,P1,P2) \ + cmp xzr, xzr __LF \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ 
+ adcs x5, x5, x4 __LF \ + adcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + adcs x7, x7, x4 __LF \ + adcs x8, x8, x3 __LF \ + ldp x9, x10, [P1+32] __LF \ + ldp x4, x3, [P2+32] __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x3 __LF \ + ldp x11, x12, [P1+48] __LF \ + ldp x4, x3, [P2+48] __LF \ + adcs x11, x11, x4 __LF \ + adcs x12, x12, x3 __LF \ + ldr x13, [P1+64] __LF \ + ldr x4, [P2+64] __LF \ + adc x13, x13, x4 __LF \ + subs x4, x13, #512 __LF \ + csetm x4, hs __LF \ + sbcs x5, x5, xzr __LF \ + and x4, x4, #0x200 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbc x13, x13, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] __LF \ + stp x11, x12, [P0+48] __LF \ + str x13, [P0+64] + +// Corresponds exactly to bignum_sub_p521 + +#define sub_p521(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + ldp x9, x10, [P1+32] __LF \ + ldp x4, x3, [P2+32] __LF \ + sbcs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + ldp x11, x12, [P1+48] __LF \ + ldp x4, x3, [P2+48] __LF \ + sbcs x11, x11, x4 __LF \ + sbcs x12, x12, x3 __LF \ + ldr x13, [P1+64] __LF \ + ldr x4, [P2+64] __LF \ + sbcs x13, x13, x4 __LF \ + sbcs x5, x5, xzr __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + and x13, x13, #0x1ff __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] __LF \ + stp x11, x12, [P0+48] __LF \ + str x13, [P0+64] + +// Weak multiplication not fully reducing + +#define weakmul_p521(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + mul x15, x3, x5 __LF \ + umulh x16, x3, x5 __LF \ + mul x14, x3, x6 __LF \ + umulh x17, x3, x6 __LF \ + adds x16, x16, x14 __LF \ + ldp x7, x8, [P2+16] __LF \ + mul x14, x3, x7 __LF \ + umulh x19, x3, x7 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x3, x8 __LF \ + umulh x20, x3, x8 __LF \ + adcs x19, x19, x14 __LF \ + ldp x9, x10, [P2+32] __LF \ + mul x14, x3, x9 __LF \ + umulh x21, x3, x9 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x3, x10 __LF \ + umulh x22, x3, x10 __LF \ + adcs x21, x21, x14 __LF \ + ldp x11, x12, [P2+48] __LF \ + mul x14, x3, x11 __LF \ + umulh x23, x3, x11 __LF \ + adcs x22, x22, x14 __LF \ + ldr x13, [P2+64] __LF \ + mul x14, x3, x12 __LF \ + umulh x24, x3, x12 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x13 __LF \ + umulh x1, x3, x13 __LF \ + adcs x24, x24, x14 __LF \ + adc x1, x1, xzr __LF \ + mul x14, x4, x5 __LF \ + adds x16, x16, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x1, x1, x14 __LF \ + cset x0, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x17, x17, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x20, x20, x14 __LF \ + 
umulh x14, x4, x8 __LF \ + adcs x21, x21, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x0, x0, x14 __LF \ + stp x15, x16, [P0] __LF \ + ldp x3, x4, [P1+16] __LF \ + mul x14, x3, x5 __LF \ + adds x17, x17, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x13 __LF \ + adcs x0, x0, x14 __LF \ + cset x15, hs __LF \ + umulh x14, x3, x5 __LF \ + adds x19, x19, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x21, x21, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x3, x12 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x13 __LF \ + adc x15, x15, x14 __LF \ + mul x14, x4, x5 __LF \ + adds x19, x19, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x15, x15, x14 __LF \ + cset x16, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x20, x20, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x21, x21, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x16, x16, x14 __LF \ + stp x17, x19, [P0+16] __LF \ + ldp x3, x4, [P1+32] __LF \ + mul x14, x3, x5 __LF \ + adds x20, x20, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x3, x13 __LF \ + adcs x16, x16, x14 __LF \ + cset x17, hs __LF \ + umulh x14, x3, x5 __LF \ + adds x21, x21, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x3, x12 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x3, x13 __LF \ + adc x17, x17, x14 __LF \ + mul x14, x4, x5 __LF \ + adds x21, x21, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x4, x7 __LF \ 
+ adcs x23, x23, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x17, x17, x14 __LF \ + cset x19, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x22, x22, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x19, x19, x14 __LF \ + stp x20, x21, [P0+32] __LF \ + ldp x3, x4, [P1+48] __LF \ + mul x14, x3, x5 __LF \ + adds x22, x22, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x3, x13 __LF \ + adcs x19, x19, x14 __LF \ + cset x20, hs __LF \ + umulh x14, x3, x5 __LF \ + adds x23, x23, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x3, x12 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x3, x13 __LF \ + adc x20, x20, x14 __LF \ + mul x14, x4, x5 __LF \ + adds x23, x23, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x20, x20, x14 __LF \ + cset x21, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x24, x24, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x21, x21, x14 __LF \ + stp x22, x23, [P0+48] __LF \ + ldr x3, [P1+64] __LF \ + mul x14, x3, x5 __LF \ + adds x24, x24, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x3, x13 __LF \ + adc x21, x21, x14 __LF \ + umulh x14, x3, x5 __LF \ + adds x1, x1, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x0, x0, x14 __LF \ + umulh 
x14, x3, x7 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x3, x12 __LF \ + adc x21, x21, x14 __LF \ + ldp x5, x6, [P0] __LF \ + extr x14, x1, x24, #9 __LF \ + adds x5, x5, x14 __LF \ + extr x14, x0, x1, #9 __LF \ + adcs x6, x6, x14 __LF \ + ldp x7, x8, [P0+16] __LF \ + extr x14, x15, x0, #9 __LF \ + adcs x7, x7, x14 __LF \ + extr x14, x16, x15, #9 __LF \ + adcs x8, x8, x14 __LF \ + ldp x9, x10, [P0+32] __LF \ + extr x14, x17, x16, #9 __LF \ + adcs x9, x9, x14 __LF \ + extr x14, x19, x17, #9 __LF \ + adcs x10, x10, x14 __LF \ + ldp x11, x12, [P0+48] __LF \ + extr x14, x20, x19, #9 __LF \ + adcs x11, x11, x14 __LF \ + extr x14, x21, x20, #9 __LF \ + adcs x12, x12, x14 __LF \ + and x13, x24, #0x1ff __LF \ + lsr x14, x21, #9 __LF \ + adc x13, x13, x14 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] __LF \ + stp x11, x12, [P0+48] __LF \ + str x13, [P0+64] + +// P0 = C * P1 - D * P2 == C * P1 + D * (p_521 - P2) + +#define cmsub_p521(P0,C,P1,D,P2) \ + ldp x6, x7, [P1] __LF \ + mov x1, #(C) __LF \ + mul x3, x1, x6 __LF \ + mul x4, x1, x7 __LF \ + umulh x6, x1, x6 __LF \ + adds x4, x4, x6 __LF \ + umulh x7, x1, x7 __LF \ + ldp x8, x9, [P1+16] __LF \ + mul x5, x1, x8 __LF \ + mul x6, x1, x9 __LF \ + umulh x8, x1, x8 __LF \ + adcs x5, x5, x7 __LF \ + umulh x9, x1, x9 __LF \ + adcs x6, x6, x8 __LF \ + ldp x10, x11, [P1+32] __LF \ + mul x7, x1, x10 __LF \ + mul x8, x1, x11 __LF \ + umulh x10, x1, x10 __LF \ + adcs x7, x7, x9 __LF \ + umulh x11, x1, x11 __LF \ + adcs x8, x8, x10 __LF \ + ldp x12, x13, [P1+48] __LF \ + mul x9, x1, x12 __LF \ + mul x10, x1, x13 __LF \ + umulh x12, x1, x12 __LF \ + adcs x9, x9, x11 __LF \ + umulh x13, x1, x13 __LF \ + adcs x10, x10, x12 __LF \ + ldr x14, [P1+64] __LF \ + mul x11, x1, x14 __LF \ + adc x11, x11, x13 __LF \ + mov x1, #(D) __LF \ + ldp x20, x21, [P2] __LF \ + mvn x20, x20 __LF \ + mul x0, x1, x20 __LF \ + umulh x20, x1, x20 __LF \ + adds x3, x3, x0 __LF \ + mvn x21, x21 __LF \ + mul x0, x1, x21 __LF \ + umulh x21, x1, x21 __LF \ + adcs x4, x4, x0 __LF \ + ldp x22, x23, [P2+16] __LF \ + mvn x22, x22 __LF \ + mul x0, x1, x22 __LF \ + umulh x22, x1, x22 __LF \ + adcs x5, x5, x0 __LF \ + mvn x23, x23 __LF \ + mul x0, x1, x23 __LF \ + umulh x23, x1, x23 __LF \ + adcs x6, x6, x0 __LF \ + ldp x17, x19, [P2+32] __LF \ + mvn x17, x17 __LF \ + mul x0, x1, x17 __LF \ + umulh x17, x1, x17 __LF \ + adcs x7, x7, x0 __LF \ + mvn x19, x19 __LF \ + mul x0, x1, x19 __LF \ + umulh x19, x1, x19 __LF \ + adcs x8, x8, x0 __LF \ + ldp x2, x16, [P2+48] __LF \ + mvn x2, x2 __LF \ + mul x0, x1, x2 __LF \ + umulh x2, x1, x2 __LF \ + adcs x9, x9, x0 __LF \ + mvn x16, x16 __LF \ + mul x0, x1, x16 __LF \ + umulh x16, x1, x16 __LF \ + adcs x10, x10, x0 __LF \ + ldr x0, [P2+64] __LF \ + eor x0, x0, #0x1ff __LF \ + mul x0, x1, x0 __LF \ + adc x11, x11, x0 __LF \ + adds x4, x4, x20 __LF \ + adcs x5, x5, x21 __LF \ + and x15, x4, x5 __LF \ + adcs x6, x6, x22 __LF \ + and x15, x15, x6 __LF \ + adcs x7, x7, x23 __LF \ + and x15, x15, x7 __LF \ + adcs x8, x8, x17 __LF \ + and x15, x15, x8 __LF \ + adcs x9, x9, x19 __LF \ + and x15, x15, x9 __LF \ + adcs x10, x10, x2 __LF \ + and x15, x15, x10 __LF \ + adc x11, x11, x16 __LF \ + lsr x12, x11, #9 __LF \ + orr x11, x11, #0xfffffffffffffe00 __LF \ + cmp xzr, xzr __LF \ + adcs xzr, x3, x12 __LF \ 
+ adcs xzr, x15, xzr __LF \ + adcs xzr, x11, xzr __LF \ + adcs x3, x3, x12 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adcs x6, x6, xzr __LF \ + adcs x7, x7, xzr __LF \ + adcs x8, x8, xzr __LF \ + adcs x9, x9, xzr __LF \ + adcs x10, x10, xzr __LF \ + adc x11, x11, xzr __LF \ + and x11, x11, #0x1ff __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] __LF \ + stp x7, x8, [P0+32] __LF \ + stp x9, x10, [P0+48] __LF \ + str x11, [P0+64] + +// P0 = 3 * P1 - 8 * P2 == 3 * P1 + 8 * (p_521 - P2) + +#define cmsub38_p521(P0,P1,P2) \ + ldp x6, x7, [P1] __LF \ + lsl x3, x6, #1 __LF \ + adds x3, x3, x6 __LF \ + extr x4, x7, x6, #63 __LF \ + adcs x4, x4, x7 __LF \ + ldp x8, x9, [P1+16] __LF \ + extr x5, x8, x7, #63 __LF \ + adcs x5, x5, x8 __LF \ + extr x6, x9, x8, #63 __LF \ + adcs x6, x6, x9 __LF \ + ldp x10, x11, [P1+32] __LF \ + extr x7, x10, x9, #63 __LF \ + adcs x7, x7, x10 __LF \ + extr x8, x11, x10, #63 __LF \ + adcs x8, x8, x11 __LF \ + ldp x12, x13, [P1+48] __LF \ + extr x9, x12, x11, #63 __LF \ + adcs x9, x9, x12 __LF \ + extr x10, x13, x12, #63 __LF \ + adcs x10, x10, x13 __LF \ + ldr x14, [P1+64] __LF \ + extr x11, x14, x13, #63 __LF \ + adc x11, x11, x14 __LF \ + ldp x20, x21, [P2] __LF \ + mvn x20, x20 __LF \ + lsl x0, x20, #3 __LF \ + adds x3, x3, x0 __LF \ + mvn x21, x21 __LF \ + extr x0, x21, x20, #61 __LF \ + adcs x4, x4, x0 __LF \ + ldp x22, x23, [P2+16] __LF \ + mvn x22, x22 __LF \ + extr x0, x22, x21, #61 __LF \ + adcs x5, x5, x0 __LF \ + and x15, x4, x5 __LF \ + mvn x23, x23 __LF \ + extr x0, x23, x22, #61 __LF \ + adcs x6, x6, x0 __LF \ + and x15, x15, x6 __LF \ + ldp x20, x21, [P2+32] __LF \ + mvn x20, x20 __LF \ + extr x0, x20, x23, #61 __LF \ + adcs x7, x7, x0 __LF \ + and x15, x15, x7 __LF \ + mvn x21, x21 __LF \ + extr x0, x21, x20, #61 __LF \ + adcs x8, x8, x0 __LF \ + and x15, x15, x8 __LF \ + ldp x22, x23, [P2+48] __LF \ + mvn x22, x22 __LF \ + extr x0, x22, x21, #61 __LF \ + adcs x9, x9, x0 __LF \ + and x15, x15, x9 __LF \ + mvn x23, x23 __LF \ + extr x0, x23, x22, #61 __LF \ + adcs x10, x10, x0 __LF \ + and x15, x15, x10 __LF \ + ldr x0, [P2+64] __LF \ + eor x0, x0, #0x1ff __LF \ + extr x0, x0, x23, #61 __LF \ + adc x11, x11, x0 __LF \ + lsr x12, x11, #9 __LF \ + orr x11, x11, #0xfffffffffffffe00 __LF \ + cmp xzr, xzr __LF \ + adcs xzr, x3, x12 __LF \ + adcs xzr, x15, xzr __LF \ + adcs xzr, x11, xzr __LF \ + adcs x3, x3, x12 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adcs x6, x6, xzr __LF \ + adcs x7, x7, xzr __LF \ + adcs x8, x8, xzr __LF \ + adcs x9, x9, xzr __LF \ + adcs x10, x10, xzr __LF \ + adc x11, x11, xzr __LF \ + and x11, x11, #0x1ff __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] __LF \ + stp x7, x8, [P0+32] __LF \ + stp x9, x10, [P0+48] __LF \ + str x11, [P0+64] + +// P0 = 4 * P1 - P2 = 4 * P1 + (p_521 - P2) + +#define cmsub41_p521(P0,P1,P2) \ + ldp x6, x7, [P1] __LF \ + lsl x3, x6, #2 __LF \ + extr x4, x7, x6, #62 __LF \ + ldp x8, x9, [P1+16] __LF \ + extr x5, x8, x7, #62 __LF \ + extr x6, x9, x8, #62 __LF \ + ldp x10, x11, [P1+32] __LF \ + extr x7, x10, x9, #62 __LF \ + extr x8, x11, x10, #62 __LF \ + ldp x12, x13, [P1+48] __LF \ + extr x9, x12, x11, #62 __LF \ + extr x10, x13, x12, #62 __LF \ + ldr x14, [P1+64] __LF \ + extr x11, x14, x13, #62 __LF \ + ldp x0, x1, [P2] __LF \ + mvn x0, x0 __LF \ + adds x3, x3, x0 __LF \ + sbcs x4, x4, x1 __LF \ + ldp x0, x1, [P2+16] __LF \ + sbcs x5, x5, x0 __LF \ + and x15, x4, x5 __LF \ + sbcs x6, x6, x1 __LF \ + and x15, x15, x6 __LF \ + ldp x0, x1, [P2+32] __LF \ + sbcs x7, 
x7, x0 __LF \ + and x15, x15, x7 __LF \ + sbcs x8, x8, x1 __LF \ + and x15, x15, x8 __LF \ + ldp x0, x1, [P2+48] __LF \ + sbcs x9, x9, x0 __LF \ + and x15, x15, x9 __LF \ + sbcs x10, x10, x1 __LF \ + and x15, x15, x10 __LF \ + ldr x0, [P2+64] __LF \ + eor x0, x0, #0x1ff __LF \ + adc x11, x11, x0 __LF \ + lsr x12, x11, #9 __LF \ + orr x11, x11, #0xfffffffffffffe00 __LF \ + cmp xzr, xzr __LF \ + adcs xzr, x3, x12 __LF \ + adcs xzr, x15, xzr __LF \ + adcs xzr, x11, xzr __LF \ + adcs x3, x3, x12 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adcs x6, x6, xzr __LF \ + adcs x7, x7, xzr __LF \ + adcs x8, x8, xzr __LF \ + adcs x9, x9, xzr __LF \ + adcs x10, x10, xzr __LF \ + adc x11, x11, xzr __LF \ + and x11, x11, #0x1ff __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] __LF \ + stp x7, x8, [P0+32] __LF \ + stp x9, x10, [P0+48] __LF \ + str x11, [P0+64] + +S2N_BN_SYMBOL(p521_jdouble_alt): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x28, [sp, #-16]! + sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + sqr_p521(z2,z_1) + sqr_p521(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + add_p521(t1,x_1,z2) + sub_p521(t2,x_1,z2) + mul_p521(x2p,t1,t2) + +// t1 = y + z +// x4p = x2p^2 +// xy2 = x * y^2 + + add_p521(t1,y_1,z_1) + sqr_p521(x4p,x2p) + weakmul_p521(xy2,x_1,y2) + +// t2 = (y + z)^2 + + sqr_p521(t2,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_p521(d,12,xy2,9,x4p) + sub_p521(t1,t2,z2) + +// y4 = y^4 + + sqr_p521(y4,y2) + +// z_3' = 2 * y * z +// dx2 = d * x2p + + sub_p521(z_3,t1,y2) + weakmul_p521(dx2,d,x2p) + +// x' = 4 * xy2 - d + + cmsub41_p521(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_p521(y_3,dx2,y4) + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x27, x28, [sp], 16 + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p521/p521_jmixadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jmixadd.S similarity index 98% rename from third_party/s2n-bignum/arm/p521/p521_jmixadd.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jmixadd.S index b04e39327fc..2ee6433b043 100644 --- a/third_party/s2n-bignum/arm/p521/p521_jmixadd.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jmixadd.S @@ -81,21 +81,21 @@ // and bignum_sub_p521 #define mul_p521(P0,P1,P2) \ - add x0, P0; \ - add x1, P1; \ - add x2, P2; \ - bl local_mul_p521 + add x0, P0 __LF \ + add x1, P1 __LF \ + add x2, P2 __LF \ + bl p521_jmixadd_local_mul_p521 #define sqr_p521(P0,P1) \ - add x0, P0; \ - add x1, P1; \ - bl local_sqr_p521 + add x0, P0 __LF \ + add x1, P1 __LF \ + bl p521_jmixadd_local_sqr_p521 #define sub_p521(P0,P1,P2) \ - add x0, P0; \ - add x1, P1; \ - add x2, P2; \ - bl local_sub_p521 + add x0, P0 __LF \ + add x1, P1 __LF \ + add x2, P2 __LF \ + bl p521_jmixadd_local_sub_p521 S2N_BN_SYMBOL(p521_jmixadd): @@ -258,7 +258,7 @@ S2N_BN_SYMBOL(p521_jmixadd): // local_mul_p521, using the tmp buffer as temporary storage and // avoiding x26. 
-local_mul_p521: +p521_jmixadd_local_mul_p521: ldp x3, x4, [x1] ldp x5, x6, [x1, #16] ldp x7, x8, [x2] @@ -885,7 +885,7 @@ local_mul_p521: str x22, [x0, #64] ret -local_sqr_p521: +p521_jmixadd_local_sqr_p521: ldp x2, x3, [x1] ldp x4, x5, [x1, #16] ldp x6, x7, [x1, #32] @@ -1300,7 +1300,7 @@ local_sqr_p521: str x10, [x0, #64] ret -local_sub_p521: +p521_jmixadd_local_sub_p521: ldp x5, x6, [x1] ldp x4, x3, [x2] subs x5, x5, x4 diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jmixadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jmixadd_alt.S new file mode 100644 index 00000000000..006d8ddc9f4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jmixadd_alt.S @@ -0,0 +1,882 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on NIST curve P-521 in Jacobian coordinates +// +// extern void p521_jmixadd_alt +// (uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 18]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. It is assumed that +// all the coordinates of the input points p1 and p2 are fully reduced +// mod p_521, that the z coordinate of p1 is nonzero and that neither +// p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents the same affine +// point as". +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jmixadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jmixadd_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 72 + +// Stable homes for input arguments during main code sequence + +#define input_z x26 +#define input_x x27 +#define input_y x28 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define NSPACE (NUMSIZE*6) + +// Corresponds exactly to bignum_mul_p521_alt + +#define mul_p521(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + mul x15, x3, x5 __LF \ + umulh x16, x3, x5 __LF \ + mul x14, x3, x6 __LF \ + umulh x17, x3, x6 __LF \ + adds x16, x16, x14 __LF \ + ldp x7, x8, [P2+16] __LF \ + mul x14, x3, x7 __LF \ + umulh x19, x3, x7 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x3, x8 __LF \ + umulh x20, x3, x8 __LF \ + adcs x19, x19, x14 __LF \ + ldp x9, x10, [P2+32] __LF \ + mul x14, x3, x9 __LF \ + umulh x21, x3, 
x9 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x3, x10 __LF \ + umulh x22, x3, x10 __LF \ + adcs x21, x21, x14 __LF \ + ldp x11, x12, [P2+48] __LF \ + mul x14, x3, x11 __LF \ + umulh x23, x3, x11 __LF \ + adcs x22, x22, x14 __LF \ + ldr x13, [P2+64] __LF \ + mul x14, x3, x12 __LF \ + umulh x24, x3, x12 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x13 __LF \ + umulh x1, x3, x13 __LF \ + adcs x24, x24, x14 __LF \ + adc x1, x1, xzr __LF \ + mul x14, x4, x5 __LF \ + adds x16, x16, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x1, x1, x14 __LF \ + cset x0, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x17, x17, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x21, x21, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x0, x0, x14 __LF \ + stp x15, x16, [P0] __LF \ + ldp x3, x4, [P1+16] __LF \ + mul x14, x3, x5 __LF \ + adds x17, x17, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x13 __LF \ + adcs x0, x0, x14 __LF \ + cset x15, hs __LF \ + umulh x14, x3, x5 __LF \ + adds x19, x19, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x21, x21, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x3, x12 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x13 __LF \ + adc x15, x15, x14 __LF \ + mul x14, x4, x5 __LF \ + adds x19, x19, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x15, x15, x14 __LF \ + cset x16, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x20, x20, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x21, x21, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x16, x16, x14 __LF \ + stp x17, x19, [P0+16] __LF \ + ldp x3, x4, [P1+32] __LF \ + mul x14, 
x3, x5 __LF \ + adds x20, x20, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x3, x13 __LF \ + adcs x16, x16, x14 __LF \ + cset x17, hs __LF \ + umulh x14, x3, x5 __LF \ + adds x21, x21, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x3, x12 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x3, x13 __LF \ + adc x17, x17, x14 __LF \ + mul x14, x4, x5 __LF \ + adds x21, x21, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x17, x17, x14 __LF \ + cset x19, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x22, x22, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x19, x19, x14 __LF \ + stp x20, x21, [P0+32] __LF \ + ldp x3, x4, [P1+48] __LF \ + mul x14, x3, x5 __LF \ + adds x22, x22, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x3, x13 __LF \ + adcs x19, x19, x14 __LF \ + cset x20, hs __LF \ + umulh x14, x3, x5 __LF \ + adds x23, x23, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x3, x12 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x3, x13 __LF \ + adc x20, x20, x14 __LF \ + mul x14, x4, x5 __LF \ + adds x23, x23, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x20, x20, x14 __LF \ + cset x21, hs __LF \ + umulh x14, x4, x5 __LF \ 
+ adds x24, x24, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x21, x21, x14 __LF \ + stp x22, x23, [P0+48] __LF \ + ldr x3, [P1+64] __LF \ + mul x14, x3, x5 __LF \ + adds x24, x24, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x3, x13 __LF \ + adc x21, x21, x14 __LF \ + umulh x14, x3, x5 __LF \ + adds x1, x1, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x3, x12 __LF \ + adc x21, x21, x14 __LF \ + cmp xzr, xzr __LF \ + ldp x5, x6, [P0] __LF \ + extr x14, x1, x24, #9 __LF \ + adcs x5, x5, x14 __LF \ + extr x14, x0, x1, #9 __LF \ + adcs x6, x6, x14 __LF \ + ldp x7, x8, [P0+16] __LF \ + extr x14, x15, x0, #9 __LF \ + adcs x7, x7, x14 __LF \ + extr x14, x16, x15, #9 __LF \ + adcs x8, x8, x14 __LF \ + ldp x9, x10, [P0+32] __LF \ + extr x14, x17, x16, #9 __LF \ + adcs x9, x9, x14 __LF \ + extr x14, x19, x17, #9 __LF \ + adcs x10, x10, x14 __LF \ + ldp x11, x12, [P0+48] __LF \ + extr x14, x20, x19, #9 __LF \ + adcs x11, x11, x14 __LF \ + extr x14, x21, x20, #9 __LF \ + adcs x12, x12, x14 __LF \ + orr x13, x24, #0xfffffffffffffe00 __LF \ + lsr x14, x21, #9 __LF \ + adcs x13, x13, x14 __LF \ + sbcs x5, x5, xzr __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbc x13, x13, xzr __LF \ + and x13, x13, #0x1ff __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] __LF \ + stp x11, x12, [P0+48] __LF \ + str x13, [P0+64] + +// Corresponds exactly to bignum_sqr_p521_alt + +#define sqr_p521(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x11, x2, x3 __LF \ + umulh x12, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x10, x2, x4 __LF \ + umulh x13, x2, x4 __LF \ + adds x12, x12, x10 __LF \ + ldp x6, x7, [P1+32] __LF \ + mul x10, x2, x5 __LF \ + umulh x14, x2, x5 __LF \ + adcs x13, x13, x10 __LF \ + ldp x8, x9, [P1+48] __LF \ + mul x10, x2, x6 __LF \ + umulh x15, x2, x6 __LF \ + adcs x14, x14, x10 __LF \ + mul x10, x2, x7 __LF \ + umulh x16, x2, x7 __LF \ + adcs x15, x15, x10 __LF \ + mul x10, x2, x8 __LF \ + umulh x17, x2, x8 __LF \ + adcs x16, x16, x10 __LF \ + mul x10, x2, x9 __LF \ + umulh x19, x2, x9 __LF \ + adcs x17, x17, x10 __LF \ + adc x19, x19, xzr __LF \ + mul x10, x3, x4 __LF \ + adds x13, x13, x10 __LF \ + mul x10, x3, x5 __LF \ + adcs x14, x14, x10 __LF \ + mul x10, x3, x6 __LF \ + adcs x15, x15, x10 __LF \ + mul x10, x3, x7 __LF \ + adcs x16, x16, x10 __LF \ + mul x10, x3, x8 __LF \ + adcs x17, x17, x10 
__LF \ + mul x10, x3, x9 __LF \ + adcs x19, x19, x10 __LF \ + cset x20, hs __LF \ + umulh x10, x3, x4 __LF \ + adds x14, x14, x10 __LF \ + umulh x10, x3, x5 __LF \ + adcs x15, x15, x10 __LF \ + umulh x10, x3, x6 __LF \ + adcs x16, x16, x10 __LF \ + umulh x10, x3, x7 __LF \ + adcs x17, x17, x10 __LF \ + umulh x10, x3, x8 __LF \ + adcs x19, x19, x10 __LF \ + umulh x10, x3, x9 __LF \ + adc x20, x20, x10 __LF \ + mul x10, x6, x7 __LF \ + umulh x21, x6, x7 __LF \ + adds x20, x20, x10 __LF \ + adc x21, x21, xzr __LF \ + mul x10, x4, x5 __LF \ + adds x15, x15, x10 __LF \ + mul x10, x4, x6 __LF \ + adcs x16, x16, x10 __LF \ + mul x10, x4, x7 __LF \ + adcs x17, x17, x10 __LF \ + mul x10, x4, x8 __LF \ + adcs x19, x19, x10 __LF \ + mul x10, x4, x9 __LF \ + adcs x20, x20, x10 __LF \ + mul x10, x6, x8 __LF \ + adcs x21, x21, x10 __LF \ + cset x22, hs __LF \ + umulh x10, x4, x5 __LF \ + adds x16, x16, x10 __LF \ + umulh x10, x4, x6 __LF \ + adcs x17, x17, x10 __LF \ + umulh x10, x4, x7 __LF \ + adcs x19, x19, x10 __LF \ + umulh x10, x4, x8 __LF \ + adcs x20, x20, x10 __LF \ + umulh x10, x4, x9 __LF \ + adcs x21, x21, x10 __LF \ + umulh x10, x6, x8 __LF \ + adc x22, x22, x10 __LF \ + mul x10, x7, x8 __LF \ + umulh x23, x7, x8 __LF \ + adds x22, x22, x10 __LF \ + adc x23, x23, xzr __LF \ + mul x10, x5, x6 __LF \ + adds x17, x17, x10 __LF \ + mul x10, x5, x7 __LF \ + adcs x19, x19, x10 __LF \ + mul x10, x5, x8 __LF \ + adcs x20, x20, x10 __LF \ + mul x10, x5, x9 __LF \ + adcs x21, x21, x10 __LF \ + mul x10, x6, x9 __LF \ + adcs x22, x22, x10 __LF \ + mul x10, x7, x9 __LF \ + adcs x23, x23, x10 __LF \ + cset x24, hs __LF \ + umulh x10, x5, x6 __LF \ + adds x19, x19, x10 __LF \ + umulh x10, x5, x7 __LF \ + adcs x20, x20, x10 __LF \ + umulh x10, x5, x8 __LF \ + adcs x21, x21, x10 __LF \ + umulh x10, x5, x9 __LF \ + adcs x22, x22, x10 __LF \ + umulh x10, x6, x9 __LF \ + adcs x23, x23, x10 __LF \ + umulh x10, x7, x9 __LF \ + adc x24, x24, x10 __LF \ + mul x10, x8, x9 __LF \ + umulh x25, x8, x9 __LF \ + adds x24, x24, x10 __LF \ + adc x25, x25, xzr __LF \ + adds x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + adcs x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adcs x17, x17, x17 __LF \ + adcs x19, x19, x19 __LF \ + adcs x20, x20, x20 __LF \ + adcs x21, x21, x21 __LF \ + adcs x22, x22, x22 __LF \ + adcs x23, x23, x23 __LF \ + adcs x24, x24, x24 __LF \ + adcs x25, x25, x25 __LF \ + cset x0, hs __LF \ + umulh x10, x2, x2 __LF \ + adds x11, x11, x10 __LF \ + mul x10, x3, x3 __LF \ + adcs x12, x12, x10 __LF \ + umulh x10, x3, x3 __LF \ + adcs x13, x13, x10 __LF \ + mul x10, x4, x4 __LF \ + adcs x14, x14, x10 __LF \ + umulh x10, x4, x4 __LF \ + adcs x15, x15, x10 __LF \ + mul x10, x5, x5 __LF \ + adcs x16, x16, x10 __LF \ + umulh x10, x5, x5 __LF \ + adcs x17, x17, x10 __LF \ + mul x10, x6, x6 __LF \ + adcs x19, x19, x10 __LF \ + umulh x10, x6, x6 __LF \ + adcs x20, x20, x10 __LF \ + mul x10, x7, x7 __LF \ + adcs x21, x21, x10 __LF \ + umulh x10, x7, x7 __LF \ + adcs x22, x22, x10 __LF \ + mul x10, x8, x8 __LF \ + adcs x23, x23, x10 __LF \ + umulh x10, x8, x8 __LF \ + adcs x24, x24, x10 __LF \ + mul x10, x9, x9 __LF \ + adcs x25, x25, x10 __LF \ + umulh x10, x9, x9 __LF \ + adc x0, x0, x10 __LF \ + ldr x1, [P1+64] __LF \ + add x1, x1, x1 __LF \ + mul x10, x1, x2 __LF \ + adds x19, x19, x10 __LF \ + umulh x10, x1, x2 __LF \ + adcs x20, x20, x10 __LF \ + mul x10, x1, x4 __LF \ + adcs x21, x21, x10 __LF \ + umulh x10, x1, x4 __LF \ + adcs x22, x22, x10 __LF 
\ + mul x10, x1, x6 __LF \ + adcs x23, x23, x10 __LF \ + umulh x10, x1, x6 __LF \ + adcs x24, x24, x10 __LF \ + mul x10, x1, x8 __LF \ + adcs x25, x25, x10 __LF \ + umulh x10, x1, x8 __LF \ + adcs x0, x0, x10 __LF \ + lsr x4, x1, #1 __LF \ + mul x4, x4, x4 __LF \ + adc x4, x4, xzr __LF \ + mul x10, x1, x3 __LF \ + adds x20, x20, x10 __LF \ + umulh x10, x1, x3 __LF \ + adcs x21, x21, x10 __LF \ + mul x10, x1, x5 __LF \ + adcs x22, x22, x10 __LF \ + umulh x10, x1, x5 __LF \ + adcs x23, x23, x10 __LF \ + mul x10, x1, x7 __LF \ + adcs x24, x24, x10 __LF \ + umulh x10, x1, x7 __LF \ + adcs x25, x25, x10 __LF \ + mul x10, x1, x9 __LF \ + adcs x0, x0, x10 __LF \ + umulh x10, x1, x9 __LF \ + adc x4, x4, x10 __LF \ + mul x2, x2, x2 __LF \ + cmp xzr, xzr __LF \ + extr x10, x20, x19, #9 __LF \ + adcs x2, x2, x10 __LF \ + extr x10, x21, x20, #9 __LF \ + adcs x11, x11, x10 __LF \ + extr x10, x22, x21, #9 __LF \ + adcs x12, x12, x10 __LF \ + extr x10, x23, x22, #9 __LF \ + adcs x13, x13, x10 __LF \ + extr x10, x24, x23, #9 __LF \ + adcs x14, x14, x10 __LF \ + extr x10, x25, x24, #9 __LF \ + adcs x15, x15, x10 __LF \ + extr x10, x0, x25, #9 __LF \ + adcs x16, x16, x10 __LF \ + extr x10, x4, x0, #9 __LF \ + adcs x17, x17, x10 __LF \ + orr x19, x19, #0xfffffffffffffe00 __LF \ + lsr x10, x4, #9 __LF \ + adcs x19, x19, x10 __LF \ + sbcs x2, x2, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbcs x15, x15, xzr __LF \ + sbcs x16, x16, xzr __LF \ + sbcs x17, x17, xzr __LF \ + sbc x19, x19, xzr __LF \ + and x19, x19, #0x1ff __LF \ + stp x2, x11, [P0] __LF \ + stp x12, x13, [P0+16] __LF \ + stp x14, x15, [P0+32] __LF \ + stp x16, x17, [P0+48] __LF \ + str x19, [P0+64] + +// Corresponds exactly to bignum_sub_p521 + +#define sub_p521(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + ldp x9, x10, [P1+32] __LF \ + ldp x4, x3, [P2+32] __LF \ + sbcs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + ldp x11, x12, [P1+48] __LF \ + ldp x4, x3, [P2+48] __LF \ + sbcs x11, x11, x4 __LF \ + sbcs x12, x12, x3 __LF \ + ldr x13, [P1+64] __LF \ + ldr x4, [P2+64] __LF \ + sbcs x13, x13, x4 __LF \ + sbcs x5, x5, xzr __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + and x13, x13, #0x1ff __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] __LF \ + stp x11, x12, [P0+48] __LF \ + str x13, [P0+64] + +S2N_BN_SYMBOL(p521_jmixadd_alt): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x28, [sp, #-16]! 
+ sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations + + sqr_p521(zp2,z_1) + mul_p521(y2a,z_1,y_2) + + mul_p521(x2a,zp2,x_2) + mul_p521(y2a,zp2,y2a) + + sub_p521(xd,x2a,x_1) + sub_p521(yd,y2a,y_1) + + sqr_p521(zz,xd) + sqr_p521(ww,yd) + + mul_p521(zzx1,zz,x_1) + mul_p521(zzx2,zz,x2a) + + sub_p521(resx,ww,zzx1) + sub_p521(t1,zzx2,zzx1) + + mul_p521(resz,xd,z_1) + + sub_p521(resx,resx,zzx2) + + sub_p521(t2,zzx1,resx) + + mul_p521(t1,t1,y_1) + mul_p521(t2,yd,t2) + + sub_p521(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + ldp x0, x1, [z_1] + orr x0, x0, x1 + ldp x2, x3, [z_1+16] + orr x2, x2, x3 + ldp x4, x5, [z_1+32] + orr x4, x4, x5 + ldp x6, x7, [z_1+48] + orr x6, x6, x7 + ldr x8, [z_1+64] + orr x0, x0, x2 + orr x4, x4, x6 + orr x0, x0, x4 + orr x0, x0, x8 + cmp x0, xzr + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with an extra z = 1 +// coordinate, hence giving 0 + p2 = p2 for the final result. + + ldp x0, x1, [resx] + ldp x20, x21, [x_2] + csel x0, x0, x20, ne + csel x1, x1, x21, ne + ldp x2, x3, [resx+16] + ldp x20, x21, [x_2+16] + csel x2, x2, x20, ne + csel x3, x3, x21, ne + ldp x4, x5, [resx+32] + ldp x20, x21, [x_2+32] + csel x4, x4, x20, ne + csel x5, x5, x21, ne + ldp x6, x7, [resx+48] + ldp x20, x21, [x_2+48] + csel x6, x6, x20, ne + csel x7, x7, x21, ne + ldr x8, [resx+64] + ldr x20, [x_2+64] + csel x8, x8, x20, ne + + ldp x10, x11, [resy] + ldp x20, x21, [y_2] + csel x10, x10, x20, ne + csel x11, x11, x21, ne + ldp x12, x13, [resy+16] + ldp x20, x21, [y_2+16] + csel x12, x12, x20, ne + csel x13, x13, x21, ne + ldp x14, x15, [resy+32] + ldp x20, x21, [y_2+32] + csel x14, x14, x20, ne + csel x15, x15, x21, ne + ldp x16, x17, [resy+48] + ldp x20, x21, [y_2+48] + csel x16, x16, x20, ne + csel x17, x17, x21, ne + ldr x19, [resy+64] + ldr x20, [y_2+64] + csel x19, x19, x20, ne + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [x_3+32] + stp x6, x7, [x_3+48] + str x8, [x_3+64] + stp x10, x11, [y_3] + stp x12, x13, [y_3+16] + stp x14, x15, [y_3+32] + stp x16, x17, [y_3+48] + str x19, [y_3+64] + + ldp x0, x1, [resz] + mov x20, #1 + csel x0, x0, x20, ne + csel x1, x1, xzr, ne + ldp x2, x3, [resz+16] + csel x2, x2, xzr, ne + csel x3, x3, xzr, ne + ldp x4, x5, [resz+32] + csel x4, x4, xzr, ne + csel x5, x5, xzr, ne + ldp x6, x7, [resz+48] + csel x6, x6, xzr, ne + csel x7, x7, xzr, ne + ldr x8, [resz+64] + csel x8, x8, xzr, ne + + stp x0, x1, [z_3] + stp x2, x3, [z_3+16] + stp x4, x5, [z_3+32] + stp x6, x7, [z_3+48] + str x8, [z_3+64] + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x27, x28, [sp], 16 + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p521/p521_jscalarmul.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jscalarmul.S similarity index 98% rename from third_party/s2n-bignum/arm/p521/p521_jscalarmul.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jscalarmul.S index 37cc9231302..3c26f7fe97f 100644 --- a/third_party/s2n-bignum/arm/p521/p521_jscalarmul.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jscalarmul.S @@ -59,29 +59,29 @@ #define NSPACE #(55*NUMSIZE+8) 
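// NOTE (reviewer annotation, not upstream source): selectblock(I) below performs one
// step of a constant-time table lookup. Each invocation loads a full 72-byte field
// element from the current table position and uses csel to keep it in x0..x8 only
// when the digit bf equals I, then advances tabup by JACSIZE; every candidate entry
// is read regardless of bf, so the memory access pattern does not depend on the
// scalar digit. The hunk itself only swaps the ';' separators for the __LF token
// used throughout the imported tree, presumably so that each instruction remains a
// separate assembler statement after the sources pass through the C preprocessor
// in the static build.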
#define selectblock(I) \ - cmp bf, #(1*I); \ - ldp x10, x11, [tabup]; \ - csel x0, x10, x0, eq; \ - csel x1, x11, x1, eq; \ - ldp x10, x11, [tabup, #16]; \ - csel x2, x10, x2, eq; \ - csel x3, x11, x3, eq; \ - ldp x10, x11, [tabup, #32]; \ - csel x4, x10, x4, eq; \ - csel x5, x11, x5, eq; \ - ldp x10, x11, [tabup, #48]; \ - csel x6, x10, x6, eq; \ - csel x7, x11, x7, eq; \ - ldr x10, [tabup, #64]; \ - csel x8, x10, x8, eq; \ + cmp bf, #(1*I) __LF \ + ldp x10, x11, [tabup] __LF \ + csel x0, x10, x0, eq __LF \ + csel x1, x11, x1, eq __LF \ + ldp x10, x11, [tabup, #16] __LF \ + csel x2, x10, x2, eq __LF \ + csel x3, x11, x3, eq __LF \ + ldp x10, x11, [tabup, #32] __LF \ + csel x4, x10, x4, eq __LF \ + csel x5, x11, x5, eq __LF \ + ldp x10, x11, [tabup, #48] __LF \ + csel x6, x10, x6, eq __LF \ + csel x7, x11, x7, eq __LF \ + ldr x10, [tabup, #64] __LF \ + csel x8, x10, x8, eq __LF \ add tabup, tabup, #JACSIZE // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 S2N_BN_SYMBOL(p521_jscalarmul): diff --git a/third_party/s2n-bignum/arm/p521/p521_jscalarmul_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jscalarmul_alt.S similarity index 98% rename from third_party/s2n-bignum/arm/p521/p521_jscalarmul_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jscalarmul_alt.S index 89e0408d8b4..1f64a9b176f 100644 --- a/third_party/s2n-bignum/arm/p521/p521_jscalarmul_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jscalarmul_alt.S @@ -59,29 +59,29 @@ #define NSPACE #(55*NUMSIZE+8) #define selectblock(I) \ - cmp bf, #(1*I); \ - ldp x10, x11, [tabup]; \ - csel x0, x10, x0, eq; \ - csel x1, x11, x1, eq; \ - ldp x10, x11, [tabup, #16]; \ - csel x2, x10, x2, eq; \ - csel x3, x11, x3, eq; \ - ldp x10, x11, [tabup, #32]; \ - csel x4, x10, x4, eq; \ - csel x5, x11, x5, eq; \ - ldp x10, x11, [tabup, #48]; \ - csel x6, x10, x6, eq; \ - csel x7, x11, x7, eq; \ - ldr x10, [tabup, #64]; \ - csel x8, x10, x8, eq; \ + cmp bf, #(1*I) __LF \ + ldp x10, x11, [tabup] __LF \ + csel x0, x10, x0, eq __LF \ + csel x1, x11, x1, eq __LF \ + ldp x10, x11, [tabup, #16] __LF \ + csel x2, x10, x2, eq __LF \ + csel x3, x11, x3, eq __LF \ + ldp x10, x11, [tabup, #32] __LF \ + csel x4, x10, x4, eq __LF \ + csel x5, x11, x5, eq __LF \ + ldp x10, x11, [tabup, #48] __LF \ + csel x6, x10, x6, eq __LF \ + csel x7, x11, x7, eq __LF \ + ldr x10, [tabup, #64] __LF \ + csel x8, x10, x8, eq __LF \ add tabup, tabup, #JACSIZE // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 S2N_BN_SYMBOL(p521_jscalarmul_alt): diff --git a/third_party/s2n-bignum/arm/p521/bignum_montmul_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/unopt/bignum_montmul_p521_base.S similarity index 78% rename from third_party/s2n-bignum/arm/p521/bignum_montmul_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/unopt/bignum_montmul_p521_base.S index e1ea8dc0c22..81ba5660a1c 100644 --- a/third_party/s2n-bignum/arm/p521/bignum_montmul_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/unopt/bignum_montmul_p521_base.S @@ -5,20 +5,20 @@ // Montgomery multiply, z := (x * y / 2^576) mod p_521 // Inputs x[9], y[9]; output z[9] // -// 
extern void bignum_montmul_p521 +// extern void bignum_montmul_p521_base // (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]); // // Does z := (x * y / 2^576) mod p_521, assuming x < p_521, y < p_521. This // means the Montgomery base is the "native size" 2^{9*64} = 2^576; since -// p_521 is a Mersenne prime the basic modular multiplication bignum_mul_p521 +// p_521 is a Mersenne prime the basic modular multiplication bignum_mul_p521_base // can be considered a Montgomery operation to base 2^521. // // Standard ARM ABI: X0 = z, X1 = x, X2 = y // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p521) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p521) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p521_base) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p521_base) .text .balign 4 @@ -33,18 +33,18 @@ // --------------------------------------------------------------------------- #define muldiffnadd(b,a,x,y,w,z) \ - subs t, x, y; \ - cneg t, t, cc; \ - csetm c, cc; \ - subs h, w, z; \ - cneg h, h, cc; \ - mul l, t, h; \ - umulh h, t, h; \ - cinv c, c, cc; \ - adds xzr, c, #1; \ - eor l, l, c; \ - adcs a, a, l; \ - eor h, h, c; \ + subs t, x, y __LF \ + cneg t, t, cc __LF \ + csetm c, cc __LF \ + subs h, w, z __LF \ + cneg h, h, cc __LF \ + mul l, t, h __LF \ + umulh h, t, h __LF \ + cinv c, c, cc __LF \ + adds xzr, c, #1 __LF \ + eor l, l, c __LF \ + adcs a, a, l __LF \ + eor h, h, c __LF \ adcs b, b, h #define z x0 @@ -85,69 +85,69 @@ #define mul4 \ /* First accumulate all the "simple" products as [s7,s6,s5,s4,s0] */ \ \ - mul s0, a0, b0; \ - mul s4, a1, b1; \ - mul s5, a2, b2; \ - mul s6, a3, b3; \ + mul s0, a0, b0 __LF \ + mul s4, a1, b1 __LF \ + mul s5, a2, b2 __LF \ + mul s6, a3, b3 __LF \ \ - umulh s7, a0, b0; \ - adds s4, s4, s7; \ - umulh s7, a1, b1; \ - adcs s5, s5, s7; \ - umulh s7, a2, b2; \ - adcs s6, s6, s7; \ - umulh s7, a3, b3; \ - adc s7, s7, xzr; \ + umulh s7, a0, b0 __LF \ + adds s4, s4, s7 __LF \ + umulh s7, a1, b1 __LF \ + adcs s5, s5, s7 __LF \ + umulh s7, a2, b2 __LF \ + adcs s6, s6, s7 __LF \ + umulh s7, a3, b3 __LF \ + adc s7, s7, xzr __LF \ \ /* Multiply by B + 1 to get [s7;s6;s5;s4;s1;s0] */ \ \ - adds s1, s4, s0; \ - adcs s4, s5, s4; \ - adcs s5, s6, s5; \ - adcs s6, s7, s6; \ - adc s7, xzr, s7; \ + adds s1, s4, s0 __LF \ + adcs s4, s5, s4 __LF \ + adcs s5, s6, s5 __LF \ + adcs s6, s7, s6 __LF \ + adc s7, xzr, s7 __LF \ \ /* Multiply by B^2 + 1 to get [s7;s6;s5;s4;s3;s2;s1;s0] */ \ \ - adds s2, s4, s0; \ - adcs s3, s5, s1; \ - adcs s4, s6, s4; \ - adcs s5, s7, s5; \ - adcs s6, xzr, s6; \ - adc s7, xzr, s7; \ + adds s2, s4, s0 __LF \ + adcs s3, s5, s1 __LF \ + adcs s4, s6, s4 __LF \ + adcs s5, s7, s5 __LF \ + adcs s6, xzr, s6 __LF \ + adc s7, xzr, s7 __LF \ \ /* Now add in all the "complicated" terms. 
*/ \ \ - muldiffnadd(s6,s5, a2,a3, b3,b2); \ - adc s7, s7, c; \ + muldiffnadd(s6,s5, a2,a3, b3,b2) __LF \ + adc s7, s7, c __LF \ \ - muldiffnadd(s2,s1, a0,a1, b1,b0); \ - adcs s3, s3, c; \ - adcs s4, s4, c; \ - adcs s5, s5, c; \ - adcs s6, s6, c; \ - adc s7, s7, c; \ + muldiffnadd(s2,s1, a0,a1, b1,b0) __LF \ + adcs s3, s3, c __LF \ + adcs s4, s4, c __LF \ + adcs s5, s5, c __LF \ + adcs s6, s6, c __LF \ + adc s7, s7, c __LF \ \ - muldiffnadd(s5,s4, a1,a3, b3,b1); \ - adcs s6, s6, c; \ - adc s7, s7, c; \ + muldiffnadd(s5,s4, a1,a3, b3,b1) __LF \ + adcs s6, s6, c __LF \ + adc s7, s7, c __LF \ \ - muldiffnadd(s3,s2, a0,a2, b2,b0); \ - adcs s4, s4, c; \ - adcs s5, s5, c; \ - adcs s6, s6, c; \ - adc s7, s7, c; \ + muldiffnadd(s3,s2, a0,a2, b2,b0) __LF \ + adcs s4, s4, c __LF \ + adcs s5, s5, c __LF \ + adcs s6, s6, c __LF \ + adc s7, s7, c __LF \ \ - muldiffnadd(s4,s3, a0,a3, b3,b0); \ - adcs s5, s5, c; \ - adcs s6, s6, c; \ - adc s7, s7, c; \ - muldiffnadd(s4,s3, a1,a2, b2,b1); \ - adcs s5, s5, c; \ - adcs s6, s6, c; \ + muldiffnadd(s4,s3, a0,a3, b3,b0) __LF \ + adcs s5, s5, c __LF \ + adcs s6, s6, c __LF \ + adc s7, s7, c __LF \ + muldiffnadd(s4,s3, a1,a2, b2,b1) __LF \ + adcs s5, s5, c __LF \ + adcs s6, s6, c __LF \ adc s7, s7, c \ -S2N_BN_SYMBOL(bignum_montmul_p521): +S2N_BN_SYMBOL(bignum_montmul_p521_base): // Save registers and make space for the temporary buffer diff --git a/third_party/s2n-bignum/arm/p521/bignum_montsqr_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/unopt/bignum_montsqr_p521_base.S similarity index 98% rename from third_party/s2n-bignum/arm/p521/bignum_montsqr_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/unopt/bignum_montsqr_p521_base.S index 2c8dbd789f8..9ee30e07f3c 100644 --- a/third_party/s2n-bignum/arm/p521/bignum_montsqr_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/unopt/bignum_montsqr_p521_base.S @@ -5,20 +5,20 @@ // Montgomery square, z := (x^2 / 2^576) mod p_521 // Input x[9]; output z[9] // -// extern void bignum_montsqr_p521 +// extern void bignum_montsqr_p521_base // (uint64_t z[static 9], uint64_t x[static 9]); // // Does z := (x^2 / 2^576) mod p_521, assuming x < p_521. This means the // Montgomery base is the "native size" 2^{9*64} = 2^576; since p_521 is -// a Mersenne prime the basic modular squaring bignum_sqr_p521 can be +// a Mersenne prime the basic modular squaring bignum_sqr_p521_base can be // considered a Montgomery operation to base 2^521. 
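// (In other words: p_521 = 2^521 - 1, so 2^521 == 1 (mod p_521) and hence
// 2^576 == 2^55 (mod p_521). The Montgomery square to base 2^576 computed here
// therefore returns x^2 * 2^-55 mod p_521, while a Montgomery operation to
// base 2^521 has R == 1 and so coincides with a plain modular product, which
// is why bignum_sqr_p521_base qualifies as one.)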
// // Standard ARM ABI: X0 = z, X1 = x // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p521) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p521) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p521_base) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p521_base) .text .balign 4 @@ -62,7 +62,7 @@ #define d7 x9 #define d8 x10 -S2N_BN_SYMBOL(bignum_montsqr_p521): +S2N_BN_SYMBOL(bignum_montsqr_p521_base): // Save registers diff --git a/third_party/s2n-bignum/arm/p521/bignum_mul_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/unopt/bignum_mul_p521_base.S similarity index 78% rename from third_party/s2n-bignum/arm/p521/bignum_mul_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/unopt/bignum_mul_p521_base.S index 97859d6bbec..2c583c17ec3 100644 --- a/third_party/s2n-bignum/arm/p521/bignum_mul_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/unopt/bignum_mul_p521_base.S @@ -5,15 +5,15 @@ // Multiply modulo p_521, z := (x * y) mod p_521, assuming x and y reduced // Inputs x[9], y[9]; output z[9] // -// extern void bignum_mul_p521 +// extern void bignum_mul_p521_base // (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]); // // Standard ARM ABI: X0 = z, X1 = x, X2 = y // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p521) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p521) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p521_base) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p521_base) .text .balign 4 @@ -28,18 +28,18 @@ // --------------------------------------------------------------------------- #define muldiffnadd(b,a,x,y,w,z) \ - subs t, x, y; \ - cneg t, t, cc; \ - csetm c, cc; \ - subs h, w, z; \ - cneg h, h, cc; \ - mul l, t, h; \ - umulh h, t, h; \ - cinv c, c, cc; \ - adds xzr, c, #1; \ - eor l, l, c; \ - adcs a, a, l; \ - eor h, h, c; \ + subs t, x, y __LF \ + cneg t, t, cc __LF \ + csetm c, cc __LF \ + subs h, w, z __LF \ + cneg h, h, cc __LF \ + mul l, t, h __LF \ + umulh h, t, h __LF \ + cinv c, c, cc __LF \ + adds xzr, c, #1 __LF \ + eor l, l, c __LF \ + adcs a, a, l __LF \ + eor h, h, c __LF \ adcs b, b, h #define z x0 @@ -80,69 +80,69 @@ #define mul4 \ /* First accumulate all the "simple" products as [s7,s6,s5,s4,s0] */ \ \ - mul s0, a0, b0; \ - mul s4, a1, b1; \ - mul s5, a2, b2; \ - mul s6, a3, b3; \ + mul s0, a0, b0 __LF \ + mul s4, a1, b1 __LF \ + mul s5, a2, b2 __LF \ + mul s6, a3, b3 __LF \ \ - umulh s7, a0, b0; \ - adds s4, s4, s7; \ - umulh s7, a1, b1; \ - adcs s5, s5, s7; \ - umulh s7, a2, b2; \ - adcs s6, s6, s7; \ - umulh s7, a3, b3; \ - adc s7, s7, xzr; \ + umulh s7, a0, b0 __LF \ + adds s4, s4, s7 __LF \ + umulh s7, a1, b1 __LF \ + adcs s5, s5, s7 __LF \ + umulh s7, a2, b2 __LF \ + adcs s6, s6, s7 __LF \ + umulh s7, a3, b3 __LF \ + adc s7, s7, xzr __LF \ \ /* Multiply by B + 1 to get [s7;s6;s5;s4;s1;s0] */ \ \ - adds s1, s4, s0; \ - adcs s4, s5, s4; \ - adcs s5, s6, s5; \ - adcs s6, s7, s6; \ - adc s7, xzr, s7; \ + adds s1, s4, s0 __LF \ + adcs s4, s5, s4 __LF \ + adcs s5, s6, s5 __LF \ + adcs s6, s7, s6 __LF \ + adc s7, xzr, s7 __LF \ \ /* Multiply by B^2 + 1 to get [s7;s6;s5;s4;s3;s2;s1;s0] */ \ \ - adds s2, s4, s0; \ - adcs s3, s5, s1; \ - adcs s4, s6, s4; \ - adcs s5, s7, s5; \ - adcs s6, xzr, s6; \ - adc s7, xzr, s7; \ + adds s2, s4, s0 __LF \ + adcs s3, s5, s1 __LF \ + adcs s4, s6, s4 __LF \ 
+ adcs s5, s7, s5 __LF \ + adcs s6, xzr, s6 __LF \ + adc s7, xzr, s7 __LF \ \ /* Now add in all the "complicated" terms. */ \ \ - muldiffnadd(s6,s5, a2,a3, b3,b2); \ - adc s7, s7, c; \ + muldiffnadd(s6,s5, a2,a3, b3,b2) __LF \ + adc s7, s7, c __LF \ \ - muldiffnadd(s2,s1, a0,a1, b1,b0); \ - adcs s3, s3, c; \ - adcs s4, s4, c; \ - adcs s5, s5, c; \ - adcs s6, s6, c; \ - adc s7, s7, c; \ + muldiffnadd(s2,s1, a0,a1, b1,b0) __LF \ + adcs s3, s3, c __LF \ + adcs s4, s4, c __LF \ + adcs s5, s5, c __LF \ + adcs s6, s6, c __LF \ + adc s7, s7, c __LF \ \ - muldiffnadd(s5,s4, a1,a3, b3,b1); \ - adcs s6, s6, c; \ - adc s7, s7, c; \ + muldiffnadd(s5,s4, a1,a3, b3,b1) __LF \ + adcs s6, s6, c __LF \ + adc s7, s7, c __LF \ \ - muldiffnadd(s3,s2, a0,a2, b2,b0); \ - adcs s4, s4, c; \ - adcs s5, s5, c; \ - adcs s6, s6, c; \ - adc s7, s7, c; \ + muldiffnadd(s3,s2, a0,a2, b2,b0) __LF \ + adcs s4, s4, c __LF \ + adcs s5, s5, c __LF \ + adcs s6, s6, c __LF \ + adc s7, s7, c __LF \ \ - muldiffnadd(s4,s3, a0,a3, b3,b0); \ - adcs s5, s5, c; \ - adcs s6, s6, c; \ - adc s7, s7, c; \ - muldiffnadd(s4,s3, a1,a2, b2,b1); \ - adcs s5, s5, c; \ - adcs s6, s6, c; \ + muldiffnadd(s4,s3, a0,a3, b3,b0) __LF \ + adcs s5, s5, c __LF \ + adcs s6, s6, c __LF \ + adc s7, s7, c __LF \ + muldiffnadd(s4,s3, a1,a2, b2,b1) __LF \ + adcs s5, s5, c __LF \ + adcs s6, s6, c __LF \ adc s7, s7, c \ -S2N_BN_SYMBOL(bignum_mul_p521): +S2N_BN_SYMBOL(bignum_mul_p521_base): // Save registers and make space for the temporary buffer diff --git a/third_party/s2n-bignum/arm/p521/bignum_sqr_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/unopt/bignum_sqr_p521_base.S similarity index 98% rename from third_party/s2n-bignum/arm/p521/bignum_sqr_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/unopt/bignum_sqr_p521_base.S index 404665258c4..937e7234ed4 100644 --- a/third_party/s2n-bignum/arm/p521/bignum_sqr_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/unopt/bignum_sqr_p521_base.S @@ -5,14 +5,14 @@ // Square modulo p_521, z := (x^2) mod p_521, assuming x reduced // Input x[9]; output z[9] // -// extern void bignum_sqr_p521 (uint64_t z[static 9], uint64_t x[static 9]); +// extern void bignum_sqr_p521_base (uint64_t z[static 9], uint64_t x[static 9]); // // Standard ARM ABI: X0 = z, X1 = x // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p521) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p521) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p521_base) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p521_base) .text .balign 4 @@ -56,7 +56,7 @@ #define d7 x9 #define d8 x10 -S2N_BN_SYMBOL(bignum_sqr_p521): +S2N_BN_SYMBOL(bignum_sqr_p521_base): // Save registers diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/Makefile b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/Makefile new file mode 100644 index 00000000000..5ba07a7d7fc --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/Makefile @@ -0,0 +1,56 @@ +############################################################################# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 +############################################################################# + +# If actually on an ARM8 machine, just use the GNU assembler (as). 
Otherwise +# use a cross-assembling version so that the code can still be assembled +# and the proofs checked against the object files (though you won't be able +# to run code without additional emulation infrastructure). The aarch64 +# cross-assembling version can be installed manually by something like: +# +# sudo apt-get install binutils-aarch64-linux-gnu + +UNAME_RESULT=$(shell uname -p) + +ifeq ($(UNAME_RESULT),aarch64) +GAS=as +else +GAS=aarch64-linux-gnu-as +endif + +# List of object files + +OBJ = bignum_add_p256k1.o \ + bignum_cmul_p256k1.o \ + bignum_deamont_p256k1.o \ + bignum_demont_p256k1.o \ + bignum_double_p256k1.o \ + bignum_half_p256k1.o \ + bignum_mod_n256k1_4.o \ + bignum_mod_p256k1_4.o \ + bignum_montmul_p256k1.o \ + bignum_montmul_p256k1_alt.o \ + bignum_montsqr_p256k1.o \ + bignum_montsqr_p256k1_alt.o \ + bignum_mul_p256k1.o \ + bignum_mul_p256k1_alt.o \ + bignum_neg_p256k1.o \ + bignum_optneg_p256k1.o \ + bignum_sqr_p256k1.o \ + bignum_sqr_p256k1_alt.o \ + bignum_sub_p256k1.o \ + bignum_tomont_p256k1.o \ + bignum_triple_p256k1.o \ + secp256k1_jadd.o \ + secp256k1_jadd_alt.o \ + secp256k1_jdouble.o \ + secp256k1_jdouble_alt.o \ + secp256k1_jmixadd.o \ + secp256k1_jmixadd_alt.o + +%.o : %.S ; $(CC) -E -I../../include $< | $(GAS) -o $@ - + +default: $(OBJ); + +clean:; rm -f *.o *.correct diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_add_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_add_p256k1.S new file mode 100644 index 00000000000..7ca98e3e86e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_add_p256k1.S @@ -0,0 +1,79 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Add modulo p_256k1, z := (x + y) mod p_256k1, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_add_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add_p256k1) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 +#define c x3 +#define l x4 +#define d0 x5 +#define d1 x6 +#define d2 x7 +#define d3 x8 +#define dd x9 + +S2N_BN_SYMBOL(bignum_add_p256k1): + +// First just add the numbers as z = x + y = 2^256 * c + [d3; d2; d1; d0] +// Also create dd = d3 AND d2 AND d1 to condense the later comparison + + ldp d0, d1, [x] + ldp l, c, [y] + adds d0, d0, l + adcs d1, d1, c + ldp d2, d3, [x, #16] + ldp l, c, [y, #16] + adcs d2, d2, l + and dd, d1, d2 + adcs d3, d3, c + and dd, dd, d3 + adc c, xzr, xzr + +// Let l = 4294968273 so that p_256k1 = 2^256 - l + + mov l, #977 + orr l, l, #0x100000000 + +// Decide whether z >= p_256k1 <=> z + 4294968273 >= 2^256 + + adds xzr, d0, l + adcs xzr, dd, xzr + adcs c, c, xzr + +// Now c <> 0 <=> z >= p_256k1, so mask the constant l accordingly + + csel l, l, xzr, ne + +// If z >= p_256k1 do z := z - p_256k1, i.e. 
add l in 4 digits + + adds d0, d0, l + adcs d1, d1, xzr + adcs d2, d2, xzr + adc d3, d3, xzr + +// Store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_cmul_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_cmul_p256k1.S new file mode 100644 index 00000000000..b287742bd2b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_cmul_p256k1.S @@ -0,0 +1,95 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word modulo p_256k1, z := (c * x) mod p_256k1, assuming +// x reduced +// Inputs c, x[4]; output z[4] +// +// extern void bignum_cmul_p256k1 +// (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = c, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p256k1) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p256k1_alt) + .text + .balign 4 + +#define z x0 +#define m x1 +#define x x2 + +#define d0 x3 +#define d1 x4 +#define d2 x5 +#define d3 x6 +#define a0 x7 +#define a1 x8 + +#define a2 x9 +#define c x9 + +#define a3 x10 +#define h x10 +#define q x10 + + +S2N_BN_SYMBOL(bignum_cmul_p256k1): + +S2N_BN_SYMBOL(bignum_cmul_p256k1_alt): + +// First do the multiply, straightforwardly to get [h;d3;d2;d1;d0] + + ldp a0, a1, [x] + ldp a2, a3, [x, #16] + mul d0, m, a0 + mul d1, m, a1 + mul d2, m, a2 + mul d3, m, a3 + umulh a0, m, a0 + umulh a1, m, a1 + umulh a2, m, a2 + umulh h, m, a3 + adds d1, d1, a0 + adcs d2, d2, a1 + adcs d3, d3, a2 + adcs h, h, xzr + +// Now the quotient estimate is q = h + 1, and then we do the reduction, +// writing z = [d3;d2;d1;d0], as z' = (2^256 * h + z) - q * p_256k1 = +// (2^256 * h + z) - q * (2^256 - 4294968273) = -2^256 + (z + 4294968273 * q) + + add q, h, #1 + mov c, #977 + orr c, c, #0x100000000 + mul a0, q, c + umulh a1, q, c + adds d0, d0, a0 + adcs d1, d1, a1 + adcs d2, d2, xzr + adcs d3, d3, xzr + +// Because of the implicit -2^256, CF means >= 0 so z' is the answer; ~CF +// means z' < 0 so we add p_256k1, which in 4 digits means subtracting c. + + csel c, c, xzr, cc + subs d0, d0, c + sbcs d1, d1, xzr + sbcs d2, d2, xzr + sbc d3, d3, xzr + +// Finally store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_deamont_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_deamont_p256k1.S new file mode 100644 index 00000000000..245b433844a --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_deamont_p256k1.S @@ -0,0 +1,110 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from Montgomery form z := (x / 2^256) mod p_256k1, +// Input x[4]; output z[4] +// +// extern void bignum_deamont_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Convert a 4-digit bignum x out of its (optionally almost) Montgomery form, +// "almost" meaning any 4-digit input will work, with no range restriction. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_deamont_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_deamont_p256k1) + .text + .balign 4 + +// Input parameters + +#define z x0 +#define x x1 + +// Rotating registers for the intermediate windows + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 + +// Other temporaries + +#define t x6 +#define w x7 +#define c x8 +#define dd x9 + +S2N_BN_SYMBOL(bignum_deamont_p256k1): + +// Load input and set up constants c = 4294968273 so p_256k1 = 2^256 - c, +// and w the negated multiplicative inverse p_256k1 * w == -1 (mod 2^64). + + ldp d0, d1, [x] + movz w, #0x3531 + movk w, #0xd225, lsl #16 + ldp d2, d3, [x, #16] + movk w, #0x091d, lsl #32 + movk w, #0xd838, lsl #48 + mov c, #977 + orr c, c, #0x100000000 + +// Four stages of Montgomery reduction, rotating the register window +// Let dd be the AND of all 4 words of the cofactor q as it is computed + + mul d0, w, d0 + umulh t, d0, c + subs d1, d1, t + + mul d1, w, d1 + umulh t, d1, c + and dd, d0, d1 + sbcs d2, d2, t + + mul d2, w, d2 + umulh t, d2, c + and dd, dd, d2 + sbcs d3, d3, t + + mul d3, w, d3 + umulh t, d3, c + and dd, dd, d3 + sbcs d0, d0, t + + sbcs d1, d1, xzr + sbcs d2, d2, xzr + sbc d3, d3, xzr + +// The result thus far is z = (x + q * p_256k1) / 2^256. Note that +// z < p_256k1 <=> x < (2^256 - q) * p_256k1, and since +// x < 2^256 < 2 * p_256k1, we have that *if* q < 2^256 - 1 then +// z < p_256k1. Conversely if q = 2^256 - 1 then since +// x + q * p_256k1 == 0 (mod 2^256) we have x == p_256k1 (mod 2^256) +// and thus x = p_256k1, and z >= p_256k1 (in fact z = p_256k1). +// So in summary z < p_256k1 <=> ~(q = 2^256 - 1) <=> ~(x = p_256k1). +// and hence iff q is all 1s, or equivalently dd is all 1s, we +// correct by subtracting p_256k1 to get 0. Since this is only one +// case we compute the result more explicitly rather than doing +// arithmetic with carry propagation. + + add c, c, d0 + cmp dd, #-1 + csel d0, c, d0, eq + csel d1, xzr, d1, eq + csel d2, xzr, d2, eq + csel d3, xzr, d3, eq + +// Write back result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_demont_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_demont_p256k1.S new file mode 100644 index 00000000000..bbea9c18c50 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_demont_p256k1.S @@ -0,0 +1,87 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from Montgomery form z := (x / 2^256) mod p_256k1, +// assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_demont_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// This assumes the input is < p_256k1 for correctness. If this is not the +// case, use the variant "bignum_deamont_p256k1" instead. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_demont_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_demont_p256k1) + .text + .balign 4 + +// Input parameters + +#define z x0 +#define x x1 + +// Rotating registers for the intermediate windows + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 + +// Other temporaries + +#define t x6 +#define w x7 +#define c x8 + +S2N_BN_SYMBOL(bignum_demont_p256k1): + +// Load input and set up constants c = 4294968273 so p_256k1 = 2^256 - c, +// and w the negated multiplicative inverse p_256k1 * w == -1 (mod 2^64). + + ldp d0, d1, [x] + movz w, #0x3531 + movk w, #0xd225, lsl #16 + ldp d2, d3, [x, #16] + movk w, #0x091d, lsl #32 + movk w, #0xd838, lsl #48 + mov c, #977 + orr c, c, #0x100000000 + +// Four stages of Montgomery reduction, rotating the register window + + mul d0, w, d0 + umulh t, d0, c + subs d1, d1, t + + mul d1, w, d1 + umulh t, d1, c + sbcs d2, d2, t + + mul d2, w, d2 + umulh t, d2, c + sbcs d3, d3, t + + mul d3, w, d3 + umulh t, d3, c + sbcs d0, d0, t + + sbcs d1, d1, xzr + sbcs d2, d2, xzr + sbc d3, d3, xzr + +// Write back result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_double_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_double_p256k1.S new file mode 100644 index 00000000000..b54c46323c6 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_double_p256k1.S @@ -0,0 +1,76 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Double modulo p_256k1, z := (2 * x) mod p_256k1, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_double_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_double_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_double_p256k1) + .text + .balign 4 + +#define z x0 +#define x x1 +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 +#define c x6 +#define dd x7 +#define l x8 + +S2N_BN_SYMBOL(bignum_double_p256k1): + +// Load the inputs and double top-down as z = 2^256 * c + [d3;d2;d1;d0] +// While doing this, create an AND dd of [d3;d2;d1] to condense comparison + + ldp d2, d3, [x, #16] + lsr c, d3, #63 + extr d3, d3, d2, #63 + ldp d0, d1, [x] + extr d2, d2, d1, #63 + and dd, d2, d3 + extr d1, d1, d0, #63 + and dd, dd, d1 + lsl d0, d0, #1 + +// Let l = 4294968273 so that p_256k1 = 2^256 - l + + mov l, #977 + orr l, l, #0x100000000 + +// Decide whether z >= p_256k1 <=> z + 4294968273 >= 2^256 + + adds xzr, d0, l + adcs xzr, dd, xzr + adcs c, c, xzr + +// Now c <> 0 <=> z >= p_256k1, so mask the constant l accordingly + + csel l, l, xzr, ne + +// If z >= p_256k1 do z := z - p_256k1, i.e. add l in 4 digits + + adds d0, d0, l + adcs d1, d1, xzr + adcs d2, d2, xzr + adc d3, d3, xzr + +// Store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_half_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_half_p256k1.S new file mode 100644 index 00000000000..70d9ced6f29 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_half_p256k1.S @@ -0,0 +1,68 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Halve modulo p_256k1, z := (x / 2) mod p_256k1, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_half_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_half_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_half_p256k1) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 +#define c x6 + +S2N_BN_SYMBOL(bignum_half_p256k1): + +// Load the 4 digits of x + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Let b be the LSB of the input (i.e. whether it is odd). +// Create c = 4294968273 * b + + mov c, #977 + orr c, c, #0x100000000 + tst d0, #1 + csel c, c, xzr, ne + +// We want (x + b * p_256k1) / 2 where b is that LSB, in {0,1}. +// That amounts to (2^256 * b + x - 4294968273 * b) / 2, and +// modulo 4 words that's the same as ([2^256 * c + x] - c) / 2. +// So do that subtraction and shift a place right as we go. 
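The identity behind this halving step can be checked with a few lines of Python on the host (illustrative sketch only, not part of the patch; the name half_p256k1 is made up here):

import random

p_256k1 = 2**256 - 4294968273

def half_p256k1(x):
    # (x + b*p_256k1)/2 with b the LSB of x; the sum is always even,
    # so the shift is exact, mirroring the subtract-then-shift in the code.
    return (x + (x & 1) * p_256k1) >> 1

for _ in range(1000):
    x = random.randrange(p_256k1)
    assert (2 * half_p256k1(x)) % p_256k1 == x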
+ + subs d0, d0, c + sbcs d1, d1, xzr + extr d0, d1, d0, #1 + sbcs d2, d2, xzr + extr d1, d2, d1, #1 + sbcs d3, d3, xzr + extr d2, d3, d2, #1 + sbc c, c, xzr + extr d3, c, d3, #1 + +// Store back and return + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_mod_n256k1_4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_mod_n256k1_4.S new file mode 100644 index 00000000000..16109d8bb37 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_mod_n256k1_4.S @@ -0,0 +1,81 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod n_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_mod_n256k1_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Reduction is modulo the group order of the secp256k1 curve. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n256k1_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n256k1_4) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define n0 x2 +#define n1 x3 +#define n2 x4 +#define n3 x5 + +#define d0 x6 +#define d1 x7 +#define d2 x8 +#define d3 x9 + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(bignum_mod_n256k1_4): + +// Load the complicated three words of n_256k1, the other being all 1s + + movbig( n0, #0xbfd2, #0x5e8c, #0xd036, #0x4141) + movbig( n1, #0xbaae, #0xdce6, #0xaf48, #0xa03b) + mov n2, 0xFFFFFFFFFFFFFFFE + +// Load the input number + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Do the subtraction. Since word 3 of n_256k1 is all 1s, that can be +// done by adding zero with carry, thanks to the inverted carry. + + subs n0, d0, n0 + sbcs n1, d1, n1 + sbcs n2, d2, n2 + adcs n3, d3, xzr + +// Now if the carry is *clear* (inversion at work) the subtraction carried +// and hence we should have done nothing, so we reset each n_i = d_i + + csel n0, d0, n0, cc + csel n1, d1, n1, cc + csel n2, d2, n2, cc + csel n3, d3, n3, cc + +// Store the end result + + stp n0, n1, [z] + stp n2, n3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_mod_p256k1_4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_mod_p256k1_4.S new file mode 100644 index 00000000000..6fb3ad133a8 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_mod_p256k1_4.S @@ -0,0 +1,65 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo field characteristic, z := x mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_mod_p256k1_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_p256k1_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_p256k1_4) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 +#define d x6 +#define c x7 + +S2N_BN_SYMBOL(bignum_mod_p256k1_4): + +// Load the inputs as [d3;d2;d1;d0] and let d be an AND of [d3;d2;d1] to +// condense the comparison below. + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + and d, d1, d2 + and d, d, d3 + +// Compare x >= p_256k1 = 2^256 - 4294968273 using condensed carry: +// we get a carry from the lowest digit and all other digits are 1. +// We end up with c and d as adjusted digits for x - p_256k1 if so. + + mov c, #977 + orr c, c, #0x100000000 + adds c, c, d0 + adcs d, d, xzr + +// If indeed x >= p_256k1 then x := x - p_256k1, using c and d + + csel d0, d0, c, cc + csel d1, d1, d, cc + csel d2, d2, d, cc + csel d3, d3, d, cc + +// Store the end result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_montmul_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_montmul_p256k1.S new file mode 100644 index 00000000000..50f0691d3ee --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_montmul_p256k1.S @@ -0,0 +1,278 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^256) mod p_256k1 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_montmul_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Does z := (2^{-256} * x * y) mod p_256k1, assuming that the inputs x and y +// satisfy x * y <= 2^256 * p_256k1 (in particular this is true if we are in +// the "usual" case x < p_256k1 and y < p_256k1). 
+// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p256k1) + .text + .balign 4 + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +// --------------------------------------------------------------------------- +// Macro returning (c,h,l) = 3-word 1s complement (x - y) * (w - z) +// c,h,l,t should all be different +// t,h should not overlap w,z +// --------------------------------------------------------------------------- + +#define muldiffn(c,h,l, t, x,y, w,z) \ + subs t, x, y __LF \ + cneg t, t, cc __LF \ + csetm c, cc __LF \ + subs h, w, z __LF \ + cneg h, h, cc __LF \ + mul l, t, h __LF \ + umulh h, t, h __LF \ + cinv c, c, cc __LF \ + eor l, l, c __LF \ + eor h, h, c + +// --------------------------------------------------------------------------- +// Core two-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d5;d4;d3;d2], modifying the +// existing contents of [d3;d2;d1] and generating d4 and d5, while +// using t1, t2, and t3 as temporaries. It's OK if d4 == d0 and d5 == d1. +// --------------------------------------------------------------------------- + +#define montreds2(d5,d4,d3,d2,d1,d0) \ + movbig(t2, 0xd838, #0x091d, #0xd225, #0x3531) __LF \ + mul d4, t2, d0 __LF \ + mov t3, #977 __LF \ + orr t3, t3, #0x100000000 __LF \ + umulh t1, d4, t3 __LF \ + subs d1, d1, t1 __LF \ + mul d5, t2, d1 __LF \ + umulh t1, d5, t3 __LF \ + sbcs d2, d2, t1 __LF \ + sbcs d3, d3, xzr __LF \ + sbcs d4, d4, xzr __LF \ + sbc d5, d5, xzr + +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define a3 x6 +#define b0 x7 +#define b1 x8 +#define b2 x9 +#define b3 x10 + +#define s0 x11 +#define s1 x12 +#define s2 x13 +#define s3 x14 +#define t0 x15 +#define t1 x16 +#define t2 x17 +#define t3 x1 +#define s4 x2 + +S2N_BN_SYMBOL(bignum_montmul_p256k1): + +// Load in all words of both inputs + + ldp a0, a1, [x1] + ldp a2, a3, [x1, #16] + ldp b0, b1, [x2] + ldp b2, b3, [x2, #16] + +// Multiply low halves with a 2x2->4 ADK multiplier as L = [s3;s2;s1;s0] + + mul s0, a0, b0 + mul s2, a1, b1 + umulh s1, a0, b0 + adds t1, s0, s2 + umulh s3, a1, b1 + adcs t2, s1, s3 + adcs s3, s3, xzr + adds s1, s1, t1 + adcs s2, s2, t2 + adcs s3, s3, xzr + muldiffn(t3,t2,t1, t0, a0,a1, b1,b0) + adds xzr, t3, #1 + adcs s1, s1, t1 + adcs s2, s2, t2 + adc s3, s3, t3 + +// Perform two "short" Montgomery steps on the low product to +// get a modified low result L' = [s1;s0;s3;s2] +// This shifts it to an offset compatible with middle terms +// Stash the result L' temporarily in the output buffer to avoid +// using additional registers. 
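For reference, a small Python model of the generic word-level Montgomery step that montreds2 fuses twice may help; the macro realizes the same congruence in subtractive form, exploiting p_256k1 = 2^256 - 4294968273, but the net effect per step is as below (sketch only, mont_word_step is an illustrative name):

p = 2**256 - 4294968273
w = (-pow(p, -1, 2**64)) % 2**64     # p * w == -1 (mod 2^64), as stated above

def mont_word_step(t):
    m = ((t % 2**64) * w) % 2**64    # multiplier derived from the low word
    t = t + m * p
    assert t % 2**64 == 0            # the low word cancels exactly
    return t >> 64                   # congruent to the input times 2^-64 mod p

t0 = (123456789 << 192) + 977
assert (mont_word_step(t0) * 2**64) % p == t0 % p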
+ + montreds2(s1,s0,s3,s2,s1,s0) + + stp s2, s3, [x0] + stp s0, s1, [x0, #16] + +// Multiply high halves with a 2x2->4 ADK multiplier as H = [s3;s2;s1;s0] + + mul s0, a2, b2 + mul s2, a3, b3 + umulh s1, a2, b2 + adds t1, s0, s2 + umulh s3, a3, b3 + adcs t2, s1, s3 + adcs s3, s3, xzr + adds s1, s1, t1 + adcs s2, s2, t2 + adcs s3, s3, xzr + muldiffn(t3,t2,t1, t0, a2,a3, b3,b2) + adds xzr, t3, #1 + adcs s1, s1, t1 + adcs s2, s2, t2 + adc s3, s3, t3 + +// Compute sign-magnitude a2,[a1,a0] = x_hi - x_lo + + subs a0, a2, a0 + sbcs a1, a3, a1 + sbc a2, xzr, xzr + adds xzr, a2, #1 + eor a0, a0, a2 + adcs a0, a0, xzr + eor a1, a1, a2 + adcs a1, a1, xzr + +// Compute sign-magnitude b2,[b1,b0] = y_lo - y_hi + + subs b0, b0, b2 + sbcs b1, b1, b3 + sbc b2, xzr, xzr + adds xzr, b2, #1 + eor b0, b0, b2 + adcs b0, b0, xzr + eor b1, b1, b2 + adcs b1, b1, xzr + +// Save the correct sign for the sub-product in b3 + + eor b3, a2, b2 + +// Add the high H to the modified low term L' as H + L' = [s4;b2;a2;t3;t0] + + ldp t0, t3, [x0] + adds t0, s0, t0 + adcs t3, s1, t3 + ldp a2, b2, [x0, #16] + adcs a2, s2, a2 + adcs b2, s3, b2 + adc s4, xzr, xzr + +// Multiply with yet a third 2x2->4 ADK multiplier for complex mid-term M + + mul s0, a0, b0 + mul s2, a1, b1 + umulh s1, a0, b0 + adds t1, s0, s2 + umulh s3, a1, b1 + adcs t2, s1, s3 + adcs s3, s3, xzr + adds s1, s1, t1 + adcs s2, s2, t2 + adcs s3, s3, xzr + muldiffn(a1,t2,t1, a0, a0,a1, b1,b0) + adds xzr, a1, #1 + adcs s1, s1, t1 + adcs s2, s2, t2 + adc s3, s3, a1 + +// Set up a sign-modified version of the mid-product in a long accumulator +// as [b3;a1;a0;s3;s2;s1;s0], adding in the H + L' term once with +// zero offset as this signed value is created + + adds xzr, b3, #1 + eor s0, s0, b3 + adcs s0, s0, t0 + eor s1, s1, b3 + adcs s1, s1, t3 + eor s2, s2, b3 + adcs s2, s2, a2 + eor s3, s3, b3 + adcs s3, s3, b2 + adcs a0, s4, b3 + adcs a1, b3, xzr + adc b3, b3, xzr + +// Add in the stashed H + L' term an offset of 2 words as well + + adds s2, s2, t0 + adcs s3, s3, t3 + adcs a0, a0, a2 + adcs a1, a1, b2 + adc b3, b3, s4 + +// Do two more Montgomery steps on the composed term +// Net pre-reduct is in [b3;a1;a0;s3;s2] + + montreds2(s1,s0,s3,s2,s1,s0) + +// Finish addition and form condensed upper digits as "dd" + +#define dd b1 + + adds a0, a0, s0 + and dd, s3, a0 + adcs a1, a1, s1 + and dd, dd, a1 + adc b3, b3, xzr + +// Because of the way we added L' in two places, we can overspill by +// more than usual in Montgomery, with the result being only known to +// be < 3 * p_256k1, not the usual < 2 * p_256k1. So now we do a more +// elaborate final correction, making use of the condensed carry dd +// to see if the initial estimate q = 4294968273 * (h + 1) results +// in a negative true result, and if so use q = 4294968273 * h. 
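This unusual correction can be sanity-checked with a short Python model; here the quotient h + 1 is used directly, whereas the register q above holds 4294968273 * (h + 1), which is how the subtraction of (h + 1) * p_256k1 is carried out in 4 digits (sketch only, final_correct is an illustrative name):

import random

p = 2**256 - 4294968273

def final_correct(v):
    # v = 2^256*h + low, only known to satisfy v < 3*p as noted above.
    h = v >> 256
    r = v - (h + 1) * p
    if r < 0:                        # the "negative true result" case
        r += p                       # equivalent to using quotient h instead
    return r

for _ in range(1000):
    v = random.randrange(3 * p)
    assert final_correct(v) == v % p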
+ +#define d0 s2 +#define d1 s3 +#define d2 a0 +#define d3 a1 +#define h b3 + +#define q s4 +#define c b0 + + madd q, h, t3, t3 + + adds xzr, d0, q + sub h, q, t3 + adcs xzr, dd, xzr + + csel q, q, h, cs + + adds d0, d0, q + adcs d1, d1, xzr + adcs d2, d2, xzr + adc d3, d3, xzr + +// Finally store the result + + stp d0, d1, [x0] + stp d2, d3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_montmul_p256k1_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_montmul_p256k1_alt.S new file mode 100644 index 00000000000..0383075c54b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_montmul_p256k1_alt.S @@ -0,0 +1,233 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^256) mod p_256k1 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_montmul_p256k1_alt +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Does z := (2^{-256} * x * y) mod p_256k1, assuming that the inputs x and y +// satisfy x * y <= 2^256 * p_256k1 (in particular this is true if we are in +// the "usual" case x < p_256k1 and y < p_256k1). +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p256k1_alt) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 + +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define a3 x6 +#define b0 x7 +#define b1 x8 +#define b2 x9 +#define b3 x10 + +#define l x11 + +#define u0 x12 +#define u1 x13 +#define u2 x14 +#define u3 x15 +#define u4 x16 + +#define c x17 + +// These alias to the input arguments when no longer needed + +#define u5 a0 +#define u6 a1 +#define u7 a2 + +#define w x1 +#define t x2 +#define uu b3 + +S2N_BN_SYMBOL(bignum_montmul_p256k1_alt): + +// Load operands and set up row 0 = [u4;...;u0] = a0 * [b3;...;b0] + + ldp a0, a1, [x] + ldp b0, b1, [y] + + mul u0, a0, b0 + umulh u1, a0, b0 + mul l, a0, b1 + umulh u2, a0, b1 + adds u1, u1, l + + ldp b2, b3, [y, #16] + + mul l, a0, b2 + umulh u3, a0, b2 + adcs u2, u2, l + + mul l, a0, b3 + umulh u4, a0, b3 + adcs u3, u3, l + adc u4, u4, xzr + + ldp a2, a3, [x, #16] + +// Start the Montgomery reductions now to interleave better, though +// conceptually they all happen after the multiplication, only modifying +// any u_i when the multiplication process no longer uses it. Set up +// constants c = 4294968273 so that p_256k1 = 2^256 - c, and w the negated +// multiplicative inverse so that p_256k1 * w == -1 (mod 2^64). + + movz w, #0x3531 + movk w, #0xd225, lsl #16 + movk w, #0x091d, lsl #32 + movk w, #0xd838, lsl #48 + mov c, #977 + orr c, c, #0x100000000 + +// Precompute this part ahead of the main Montgomery stage. This +// is a repeated pattern below, since it seems to slightly improve +// dependent latencies. 
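Functionally, this routine and bignum_montmul_p256k1 compute the same map; a minimal Python reference for the stated contract (montmul_ref is an illustrative name, not an exported symbol):

import random

p = 2**256 - 4294968273
R = 2**256

def montmul_ref(x, y):
    # z := x * y / 2^256 mod p_256k1, the contract stated in the header
    return (x * y * pow(R, -1, p)) % p

for _ in range(1000):
    x, y = random.randrange(p), random.randrange(p)
    assert (montmul_ref(x, y) * R - x * y) % p == 0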
+ + mul u0, w, u0 + +// Row 1 = [u5;...;u0] = [a1;a0] * [b3;...;b0] + + mul l, a1, b0 + adds u1, u1, l + mul l, a1, b1 + adcs u2, u2, l + mul l, a1, b2 + adcs u3, u3, l + mul l, a1, b3 + adcs u4, u4, l + umulh u5, a1, b3 + adc u5, u5, xzr + + umulh l, a1, b0 + adds u2, u2, l + umulh l, a1, b1 + adcs u3, u3, l + umulh l, a1, b2 + adcs u4, u4, l + adc u5, u5, xzr + +// Montgomery stage 0; use t to record the suspended carry + + umulh l, u0, c + subs u1, u1, l + cset t, cc + +// Row 2 = [u6;...;u0] = [a2;a1;a0] * [b3;...;b0] + + mul l, a2, b0 + adds u2, u2, l + mul l, a2, b1 + adcs u3, u3, l + mul l, a2, b2 + adcs u4, u4, l + mul l, a2, b3 + adcs u5, u5, l + umulh u6, a2, b3 + adc u6, u6, xzr + + mul u1, w, u1 + + umulh l, a2, b0 + adds u3, u3, l + umulh l, a2, b1 + adcs u4, u4, l + umulh l, a2, b2 + adcs u5, u5, l + adc u6, u6, xzr + +// Montgomery stage 1 + + umulh l, u1, c + add l, l, t + subs u2, u2, l + cset t, cc + +// Row 3 = [u7;...;u0] = [a3;...a0] * [b3;...;b0] + + mul l, a3, b0 + adds u3, u3, l + mul l, a3, b1 + adcs u4, u4, l + mul l, a3, b2 + adcs u5, u5, l + mul l, a3, b3 + adcs u6, u6, l + umulh u7, a3, b3 + adc u7, u7, xzr + + mul u2, w, u2 + + umulh l, a3, b0 + adds u4, u4, l + umulh l, a3, b1 + adcs u5, u5, l + umulh l, a3, b2 + adcs u6, u6, l + adc u7, u7, xzr + +// Montgomery stages 2 and 3 (no longer using t to link the carries). + + umulh l, u2, c + add l, l, t + subs u3, u3, l + mul u3, w, u3 + umulh l, u3, c + sbcs u0, u0, l + sbcs u1, u1, xzr + sbcs u2, u2, xzr + sbc u3, u3, xzr + +// Now if a * b = 2^256 * h + l is the full product, we now have +// [u7;u6;u5;u4] = h and 2^256 * [u3;u2;u1;u0] == l (mod p_256k1) because +// of the Montgomery reductions on the low half. Now add the high part +// and the Montgomery-reduced low part. + + adds u0, u0, u4 + adcs u1, u1, u5 + adcs u2, u2, u6 + and uu, u1, u2 + adcs u3, u3, u7 + and uu, uu, u3 + cset t, cs + +// Decide whether z >= p_256k1 <=> z + 4294968273 >= 2^256 + + adds xzr, u0, c + adcs xzr, uu, xzr + adcs t, t, xzr + +// Now t <> 0 <=> z >= p_256k1, so mask the constant c accordingly + + csel c, c, xzr, ne + +// If z >= p_256k1 do z := z - p_256k1, i.e. add c in 4 digits + + adds u0, u0, c + adcs u1, u1, xzr + adcs u2, u2, xzr + adc u3, u3, xzr + +// Write back + + stp u0, u1, [z] + stp u2, u3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_montsqr_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_montsqr_p256k1.S new file mode 100644 index 00000000000..1fe1f12a680 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_montsqr_p256k1.S @@ -0,0 +1,183 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^256) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_montsqr_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Does z := (x^2 / 2^256) mod p_256k1, assuming x^2 <= 2^256 * p_256k1, which +// is guaranteed in particular if x < p_256k1 initially (the "intended" case). 
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p256k1) + .text + .balign 4 + +#define z x0 +#define x x1 + +// Variables + +#define u0 x2 +#define u1 x3 +#define u2 x4 +#define u3 x5 +#define u4 x6 +#define u5 x7 +#define u6 x8 +#define u7 x9 + +#define w x10 +#define c x11 +#define t x12 +#define uu x13 + +S2N_BN_SYMBOL(bignum_montsqr_p256k1): + +// First just a near-clone of bignum_sqr_4_8 to get the square, using +// different registers to collect full product without writeback. + + ldp u4, u5, [x] + ldp x10, x11, [x, #16] + mul u2, u4, x10 + mul u7, u5, x11 + umulh x12, u4, x10 + subs x13, u4, u5 + cneg x13, x13, cc + csetm u1, cc + subs u0, x11, x10 + cneg u0, u0, cc + mul u6, x13, u0 + umulh u0, x13, u0 + cinv u1, u1, cc + eor u6, u6, u1 + eor u0, u0, u1 + adds u3, u2, x12 + adc x12, x12, xzr + umulh x13, u5, x11 + adds u3, u3, u7 + adcs x12, x12, x13 + adc x13, x13, xzr + adds x12, x12, u7 + adc x13, x13, xzr + cmn u1, #0x1 + adcs u3, u3, u6 + adcs x12, x12, u0 + adc x13, x13, u1 + adds u2, u2, u2 + adcs u3, u3, u3 + adcs x12, x12, x12 + adcs x13, x13, x13 + adc x14, xzr, xzr + mul u0, u4, u4 + mul u6, u5, u5 + mul x15, u4, u5 + umulh u1, u4, u4 + umulh u7, u5, u5 + umulh x16, u4, u5 + adds u1, u1, x15 + adcs u6, u6, x16 + adc u7, u7, xzr + adds u1, u1, x15 + adcs u6, u6, x16 + adc u7, u7, xzr + adds u2, u2, u6 + adcs u3, u3, u7 + adcs x12, x12, xzr + adcs x13, x13, xzr + adc x14, x14, xzr + mul u4, x10, x10 + mul u6, x11, x11 + mul x15, x10, x11 + umulh u5, x10, x10 + umulh u7, x11, x11 + umulh x16, x10, x11 + adds u5, u5, x15 + adcs u6, u6, x16 + adc u7, u7, xzr + adds u5, u5, x15 + adcs u6, u6, x16 + adc u7, u7, xzr + adds u4, u4, x12 + adcs u5, u5, x13 + adcs u6, u6, x14 + adc u7, u7, xzr + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [u7,u6,u5,u4] and l = [u3,u2,u1,u0]. Set up constants +// c = 4294968273 so that p_256k1 = 2^256 - c, and w the negated +// multiplicative inverse so that p_256k1 * w == -1 (mod 2^64). + + movz w, #0x3531 + movk w, #0xd225, lsl #16 + movk w, #0x091d, lsl #32 + movk w, #0xd838, lsl #48 + mov c, #977 + orr c, c, #0x100000000 + +// Do 4 iterations of Montgomery reduction, rotating [u3;u2;u1;u0] + + mul u0, w, u0 + umulh t, u0, c + subs u1, u1, t + + mul u1, w, u1 + umulh t, u1, c + sbcs u2, u2, t + + mul u2, w, u2 + umulh t, u2, c + sbcs u3, u3, t + + mul u3, w, u3 + umulh t, u3, c + sbcs u0, u0, t + + sbcs u1, u1, xzr + sbcs u2, u2, xzr + sbc u3, u3, xzr + +// Add the high part and the Montgomery reduced low part + + adds u0, u0, u4 + adcs u1, u1, u5 + adcs u2, u2, u6 + and uu, u1, u2 + adcs u3, u3, u7 + and uu, uu, u3 + cset t, cs + +// Decide whether z >= p_256k1 <=> z + 4294968273 >= 2^256 + + adds xzr, u0, c + adcs xzr, uu, xzr + adcs t, t, xzr + +// Now t <> 0 <=> z >= p_256k1, so mask the constant c accordingly + + csel c, c, xzr, ne + +// If z >= p_256k1 do z := z - p_256k1, i.e. 
add c in 4 digits + + adds u0, u0, c + adcs u1, u1, xzr + adcs u2, u2, xzr + adc u3, u3, xzr + +// Write back + + stp u0, u1, [x0] + stp u2, u3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_montsqr_p256k1_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_montsqr_p256k1_alt.S new file mode 100644 index 00000000000..f4d141c9256 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_montsqr_p256k1_alt.S @@ -0,0 +1,194 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^256) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_montsqr_p256k1_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Does z := (x^2 / 2^256) mod p_256k1, assuming x^2 <= 2^256 * p_256k1, which +// is guaranteed in particular if x < p_256k1 initially (the "intended" case). +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p256k1_alt) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 + +#define h x6 +#define l x7 + +#define u0 x8 +#define u1 x9 +#define u2 x10 +#define u3 x11 +#define u4 x12 +#define u5 x13 +#define u6 x14 +#define u7 x15 + +// Just aliases (we only use w after loading the inputs) + +#define w x +#define t h +#define c a0 +#define uu a1 + +S2N_BN_SYMBOL(bignum_montsqr_p256k1_alt): + +// Load all the elements, set up an initial window [u6;...u1] = [23;03;01] +// and chain in the addition of 02 + 12 + 13 (no carry-out is possible). +// This gives all the "heterogeneous" terms of the squaring ready to double + + ldp a0, a1, [x] + + mul u1, a0, a1 + umulh u2, a0, a1 + + ldp a2, a3, [x, #16] + + mul u3, a0, a3 + umulh u4, a0, a3 + + mul l, a0, a2 + umulh h, a0, a2 + adds u2, u2, l + + adcs u3, u3, h + mul l, a1, a2 + umulh h, a1, a2 + adc h, h, xzr + adds u3, u3, l + + mul u5, a2, a3 + umulh u6, a2, a3 + + adcs u4, u4, h + mul l, a1, a3 + umulh h, a1, a3 + adc h, h, xzr + adds u4, u4, l + + adcs u5, u5, h + adc u6, u6, xzr + +// Now just double it; this simple approach seems to work better than extr + + adds u1, u1, u1 + adcs u2, u2, u2 + adcs u3, u3, u3 + adcs u4, u4, u4 + adcs u5, u5, u5 + adcs u6, u6, u6 + cset u7, cs + +// Add the homogeneous terms 00 + 11 + 22 + 33 + + umulh l, a0, a0 + mul u0, a0, a0 + adds u1, u1, l + +// Start the Montgomery reductions now to interleave better, though +// conceptually they all happen after the multiplication, only modifying +// any u_i when the multiplication process no longer uses it. Set up +// constants c = 4294968273 so that p_256k1 = 2^256 - c, and w the negated +// multiplicative inverse so that p_256k1 * w == -1 (mod 2^64). +// Precompute a little ahead of the main Montgomery stage. 
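A compact Python model of what the four rotating word-level iterations, together with the conditional correction further down, achieve (sketch of the net effect only, not the register-level schedule; montredc4 is an illustrative name):

import random

p = 2**256 - 4294968273
w = (-pow(p, -1, 2**64)) % 2**64

def montredc4(t):
    # Net effect: reduce t (< 2^256 * p) to t / 2^256 mod p_256k1.
    for _ in range(4):
        m = ((t % 2**64) * w) % 2**64
        t = (t + m * p) >> 64
    return t - p if t >= p else t

for _ in range(1000):
    x = random.randrange(p)
    s = x * x                        # the 8-digit square being reduced
    assert montredc4(s) == (s * pow(2**256, -1, p)) % p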
+ + movz w, #0x3531 + movk w, #0xd225, lsl #16 + movk w, #0x091d, lsl #32 + movk w, #0xd838, lsl #48 + mov c, #977 + orr c, c, #0x100000000 + mul u0, w, u0 + + mul l, a1, a1 + adcs u2, u2, l + umulh l, a1, a1 + adcs u3, u3, l + + mul l, a2, a2 + adcs u4, u4, l + umulh l, a2, a2 + adcs u5, u5, l + + mul l, a3, a3 + adcs u6, u6, l + umulh l, a3, a3 + adc u7, u7, l + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [u7,u6,u5,u4] and l = [u3,u2,u1,u0']. We actually precomputed +// the Montgomery multiplier in u0, but otherwise continue with +// 4 iterations of Montgomery reduction, rotating [u3;u2;u1;u0] + + umulh l, u0, c + subs u1, u1, l + + mul u1, w, u1 + umulh l, u1, c + sbcs u2, u2, l + + mul u2, w, u2 + umulh l, u2, c + sbcs u3, u3, l + + mul u3, w, u3 + umulh l, u3, c + sbcs u0, u0, l + + sbcs u1, u1, xzr + sbcs u2, u2, xzr + sbc u3, u3, xzr + +// Add the high part and the Montgomery reduced low part + + adds u0, u0, u4 + adcs u1, u1, u5 + adcs u2, u2, u6 + and uu, u1, u2 + adcs u3, u3, u7 + and uu, uu, u3 + cset t, cs + +// Decide whether z >= p_256k1 <=> z + 4294968273 >= 2^256 + + adds xzr, u0, c + adcs xzr, uu, xzr + adcs t, t, xzr + +// Now t <> 0 <=> z >= p_256k1, so mask the constant c accordingly + + csel c, c, xzr, ne + +// If z >= p_256k1 do z := z - p_256k1, i.e. add c in 4 digits + + adds u0, u0, c + adcs u1, u1, xzr + adcs u2, u2, xzr + adc u3, u3, xzr + +// Write back + + stp u0, u1, [z] + stp u2, u3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_mul_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_mul_p256k1.S new file mode 100644 index 00000000000..6b2b5aee802 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_mul_p256k1.S @@ -0,0 +1,302 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply modulo p_256k1, z := (x * y) mod p_256k1 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_mul_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p256k1) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 + +#define a0 x3 +#define a1 x4 +#define b0 x5 +#define b1 x6 + +#define u0 x7 +#define u1 x8 +#define u2 x9 +#define u3 x10 +#define u4 x11 +#define u5 x12 +#define u6 x13 +#define u7 x14 + +#define t x15 + +#define sgn x16 +#define ysgn x17 + +// These are aliases to registers used elsewhere including input pointers. +// By the time they are used this does not conflict with other uses. 
+ +#define m0 y +#define m1 ysgn +#define m2 t +#define m3 x +#define u u2 + +// For the reduction stages, again aliasing other things + +#define c x1 +#define h x2 +#define l x15 +#define d x16 +#define q x17 +#define a2 x11 +#define a3 x12 +#define b2 x13 +#define b3 x14 + +S2N_BN_SYMBOL(bignum_mul_p256k1): + +// Multiply the low halves using Karatsuba 2x2->4 to get [u3,u2,u1,u0] + + ldp a0, a1, [x] + ldp b0, b1, [y] + + mul u0, a0, b0 + umulh u1, a0, b0 + mul u2, a1, b1 + umulh u3, a1, b1 + + subs a1, a1, a0 + cneg a1, a1, cc + csetm sgn, cc + + adds u2, u2, u1 + adc u3, u3, xzr + + subs a0, b0, b1 + cneg a0, a0, cc + cinv sgn, sgn, cc + + mul t, a1, a0 + umulh a0, a1, a0 + + adds u1, u0, u2 + adcs u2, u2, u3 + adc u3, u3, xzr + + adds xzr, sgn, #1 + eor t, t, sgn + adcs u1, t, u1 + eor a0, a0, sgn + adcs u2, a0, u2 + adc u3, u3, sgn + +// Multiply the high halves using Karatsuba 2x2->4 to get [u7,u6,u5,u4] + + ldp a0, a1, [x, #16] + ldp b0, b1, [y, #16] + + mul u4, a0, b0 + umulh u5, a0, b0 + mul u6, a1, b1 + umulh u7, a1, b1 + + subs a1, a1, a0 + cneg a1, a1, cc + csetm sgn, cc + + adds u6, u6, u5 + adc u7, u7, xzr + + subs a0, b0, b1 + cneg a0, a0, cc + cinv sgn, sgn, cc + + mul t, a1, a0 + umulh a0, a1, a0 + + adds u5, u4, u6 + adcs u6, u6, u7 + adc u7, u7, xzr + + adds xzr, sgn, #1 + eor t, t, sgn + adcs u5, t, u5 + eor a0, a0, sgn + adcs u6, a0, u6 + adc u7, u7, sgn + +// Compute sgn,[a1,a0] = x_hi - x_lo +// and ysgn,[b1,b0] = y_lo - y_hi +// sign-magnitude differences + + ldp a0, a1, [x, #16] + ldp t, sgn, [x] + subs a0, a0, t + sbcs a1, a1, sgn + csetm sgn, cc + + ldp t, ysgn, [y] + subs b0, t, b0 + sbcs b1, ysgn, b1 + csetm ysgn, cc + + eor a0, a0, sgn + subs a0, a0, sgn + eor a1, a1, sgn + sbc a1, a1, sgn + + eor b0, b0, ysgn + subs b0, b0, ysgn + eor b1, b1, ysgn + sbc b1, b1, ysgn + +// Save the correct sign for the sub-product + + eor sgn, ysgn, sgn + +// Add H' = H + L_top, still in [u7,u6,u5,u4] + + adds u4, u4, u2 + adcs u5, u5, u3 + adcs u6, u6, xzr + adc u7, u7, xzr + +// Now compute the mid-product as [m3,m2,m1,m0] + + mul m0, a0, b0 + umulh m1, a0, b0 + mul m2, a1, b1 + umulh m3, a1, b1 + + subs a1, a1, a0 + cneg a1, a1, cc + csetm u, cc + + adds m2, m2, m1 + adc m3, m3, xzr + + subs b1, b0, b1 + cneg b1, b1, cc + cinv u, u, cc + + mul b0, a1, b1 + umulh b1, a1, b1 + + adds m1, m0, m2 + adcs m2, m2, m3 + adc m3, m3, xzr + + adds xzr, u, #1 + eor b0, b0, u + adcs m1, b0, m1 + eor b1, b1, u + adcs m2, b1, m2 + adc m3, m3, u + +// Accumulate the positive mid-terms as [u7,u6,u5,u4,u3,u2] + + adds u2, u4, u0 + adcs u3, u5, u1 + adcs u4, u6, u4 + adcs u5, u7, u5 + adcs u6, u6, xzr + adc u7, u7, xzr + +// Add in the sign-adjusted complex term + + adds xzr, sgn, #1 + eor m0, m0, sgn + adcs u2, m0, u2 + eor m1, m1, sgn + adcs u3, m1, u3 + eor m2, m2, sgn + adcs u4, m2, u4 + eor m3, m3, sgn + adcs u5, m3, u5 + adcs u6, u6, sgn + adc u7, u7, sgn + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [u7,u6,u5,u4] and l = [u3,u2,u1,u0] +// and this is == 4294968273 * h + l (mod p_256k1) +// Some of the word products are done straightforwardly using mul + umulh +// while others are broken down in a more complicated way as +// (2^32 + 977) * (2^32 * h + l) = 2^64 * h + 2^32 * (d * h + l) + d * l + + mov d, #977 + orr c, d, #0x100000000 + + mul a0, c, u4 + umulh b0, c, u4 + + and l, u5, #0xFFFFFFFF + lsr h, u5, #32 + mul a1, d, l + madd l, d, h, l + adds a1, a1, l, lsl #32 + lsr l, l, #32 + adc b1, h, l + + mul a2, c, u6 + umulh b2, c, u6 + + and l, u7, #0xFFFFFFFF 
+ lsr h, u7, #32 + mul a3, d, l + madd l, d, h, l + adds a3, a3, l, lsl #32 + lsr l, l, #32 + adc b3, h, l + + adds u0, u0, a0 + adcs u1, u1, a1 + adcs u2, u2, a2 + adcs u3, u3, a3 + cset u4, cs + + adds u1, u1, b0 + adcs u2, u2, b1 + adcs u3, u3, b2 + adc u4, u4, b3 + +// Now we have reduced to 5 digits, 2^256 * h + l = [u4,u3,u2,u1,u0] +// Use q = h + 1 as the initial quotient estimate, either right or 1 too big. +// Since q <= 2^33 we do 4294968273 * q = (q<<32) + 977 * q to avoid umulh + + add q, u4, #1 + mul a0, d, q + lsr a1, q, #32 + adds a0, a0, q, lsl #32 + adc a1, xzr, a1 + adds u0, u0, a0 + adcs u1, u1, a1 + adcs u2, u2, xzr + adcs u3, u3, xzr + +// Now the effective answer is 2^256 * (CF - 1) + [u3,u2,u1,u0] +// So we correct if CF = 0 by subtracting 4294968273, i.e. by +// adding p_256k1 to the "full" answer + + csel c, c, xzr, cc + subs u0, u0, c + sbcs u1, u1, xzr + sbcs u2, u2, xzr + sbc u3, u3, xzr + +// Write back and return + + stp u0, u1, [x0] + stp u2, u3, [x0, #16] + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_mul_p256k1_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_mul_p256k1_alt.S new file mode 100644 index 00000000000..5ed08227792 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_mul_p256k1_alt.S @@ -0,0 +1,199 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply modulo p_256k1, z := (x * y) mod p_256k1 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_mul_p256k1_alt +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p256k1_alt) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 + +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define a3 x6 +#define b0 x7 +#define b1 x8 +#define b2 x9 +#define b3 x10 + +#define l x11 + +#define u0 x12 +#define u1 x13 +#define u2 x14 +#define u3 x15 +#define u4 x16 + +// These alias to the input arguments when no longer needed + +#define u5 a0 +#define u6 a1 +#define u7 a2 + +#define c b0 +#define q b1 +#define h b2 + +S2N_BN_SYMBOL(bignum_mul_p256k1_alt): + +// Load operands and set up row 0 = [u4;...;u0] = a0 * [b3;...;b0] + + ldp a0, a1, [x] + ldp b0, b1, [y] + + mul u0, a0, b0 + umulh u1, a0, b0 + mul l, a0, b1 + umulh u2, a0, b1 + adds u1, u1, l + + ldp b2, b3, [y, #16] + + mul l, a0, b2 + umulh u3, a0, b2 + adcs u2, u2, l + + mul l, a0, b3 + umulh u4, a0, b3 + adcs u3, u3, l + adc u4, u4, xzr + + ldp a2, a3, [x, #16] + +// Row 1 = [u5;...;u0] = [a1;a0] * [b3;...;b0] + + mul l, a1, b0 + adds u1, u1, l + mul l, a1, b1 + adcs u2, u2, l + mul l, a1, b2 + adcs u3, u3, l + mul l, a1, b3 + adcs u4, u4, l + umulh u5, a1, b3 + adc u5, u5, xzr + + umulh l, a1, b0 + adds u2, u2, l + umulh l, a1, b1 + adcs u3, u3, l + umulh l, a1, b2 + adcs u4, u4, l + adc u5, u5, xzr + +// Row 2 = [u6;...;u0] = [a2;a1;a0] * [b3;...;b0] + + mul l, a2, b0 + adds u2, u2, l + mul l, a2, b1 + adcs u3, u3, l + mul l, a2, b2 + adcs u4, u4, l + mul l, a2, b3 + adcs u5, u5, l + umulh u6, a2, b3 + adc 
u6, u6, xzr + + umulh l, a2, b0 + adds u3, u3, l + umulh l, a2, b1 + adcs u4, u4, l + umulh l, a2, b2 + adcs u5, u5, l + adc u6, u6, xzr + +// Row 3 = [u7;...;u0] = [a3;...a0] * [b3;...;b0] + + mul l, a3, b0 + adds u3, u3, l + mul l, a3, b1 + adcs u4, u4, l + mul l, a3, b2 + adcs u5, u5, l + mul l, a3, b3 + adcs u6, u6, l + umulh u7, a3, b3 + adc u7, u7, xzr + + umulh l, a3, b0 + adds u4, u4, l + umulh l, a3, b1 + adcs u5, u5, l + umulh l, a3, b2 + adcs u6, u6, l + adc u7, u7, xzr + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [u7,u6,u5,u4] and l = [u3,u2,u1,u0] +// and this is == 4294968273 * h + l (mod p_256k1) + + mov c, #977 + orr c, c, #0x100000000 + + mul l, c, u4 + umulh h, c, u4 + adds u0, u0, l + + mul l, c, u5 + umulh u5, c, u5 + adcs u1, u1, l + + mul l, c, u6 + umulh u6, c, u6 + adcs u2, u2, l + + mul l, c, u7 + umulh u7, c, u7 + adcs u3, u3, l + cset u4, cs + + adds u1, u1, h + adcs u2, u2, u5 + adcs u3, u3, u6 + adc u4, u4, u7 + +// Now we have reduced to 5 digits, 2^256 * h + l = [u4,u3,u2,u1,u0] +// Use q = h + 1 as the initial quotient estimate, either right or 1 too big. + + add q, u4, #1 + mul l, c, q + umulh h, c, q + adds u0, u0, l + adcs u1, u1, h + adcs u2, u2, xzr + adcs u3, u3, xzr + +// Now the effective answer is 2^256 * (CF - 1) + [u3,u2,u1,u0] +// So we correct if CF = 0 by subtracting 4294968273, i.e. by +// adding p_256k1 to the "full" answer + + csel c, c, xzr, cc + subs u0, u0, c + sbcs u1, u1, xzr + sbcs u2, u2, xzr + sbc u3, u3, xzr + +// Write back and return + + stp u0, u1, [x0] + stp u2, u3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_neg_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_neg_p256k1.S new file mode 100644 index 00000000000..d0749fac9e4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_neg_p256k1.S @@ -0,0 +1,65 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Negate modulo p_256k1, z := (-x) mod p_256k1, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_neg_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_neg_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_neg_p256k1) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define p x2 +#define d0 x3 +#define d1 x4 +#define d2 x5 +#define d3 x6 +#define c x7 + +S2N_BN_SYMBOL(bignum_neg_p256k1): + +// Load the 4 digits of x and let c be an OR of all the digits + + ldp d0, d1, [x] + orr c, d0, d1 + ldp d2, d3, [x, #16] + orr c, c, d2 + orr c, c, d3 + +// Turn q into a strict bitmask, and c a masked constant -4294968273, +// computing it in effect as ~4294968272 = ~(2^32 + 976) + + cmp c, xzr + csetm p, ne + mov c, #976 + orr c, c, #0x100000000 + bic c, p, c + +// Now just do [2^256 - 4294968273] - x where the constant is masked + + subs d0, c, d0 + sbcs d1, p, d1 + sbcs d2, p, d2 + sbc d3, p, d3 + +// Write back result and return + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_optneg_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_optneg_p256k1.S new file mode 100644 index 00000000000..86f8dd5e177 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_optneg_p256k1.S @@ -0,0 +1,74 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally negate modulo p_256k1, z := (-x) mod p_256k1 (if p nonzero) or +// z := x (if p zero), assuming x reduced +// Inputs p, x[4]; output z[4] +// +// extern void bignum_optneg_p256k1 +// (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = p, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optneg_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optneg_p256k1) + .text + .balign 4 + +#define z x0 +#define p x1 +#define x x2 + +#define d0 x3 +#define d1 x4 +#define d2 x5 +#define d3 x6 +#define c x7 + +S2N_BN_SYMBOL(bignum_optneg_p256k1): + +// Load the 4 digits of x and let c be an OR of all the digits + + ldp d0, d1, [x] + orr c, d0, d1 + ldp d2, d3, [x, #16] + orr c, c, d2 + orr c, c, d3 + +// Turn p into a strict bitmask. Force it to zero if the input is zero, +// to avoid giving -0 = p_256k1, which is not reduced though correct modulo. 
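The intended input/output relation, including the zero special case noted above, in a few lines of Python (optneg_ref is an illustrative name; flag models the p argument):

p = 2**256 - 4294968273

def optneg_ref(flag, x):
    # z := (-x) mod p_256k1 if flag is nonzero, else x, for reduced x;
    # zero must map to zero rather than to p_256k1 in either case.
    assert 0 <= x < p
    return (-x) % p if flag else x

assert optneg_ref(1, 0) == 0 and optneg_ref(0, 0) == 0
assert optneg_ref(1, 5) == p - 5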
+ + cmp p, xzr + csetm p, ne + cmp c, xzr + csel p, xzr, p, eq + +// We want z := if p then (2^256 - 4294968273) - x else x +// which is: [if p then ~x else x] - [if p then 4294968272 else 0] + + mov c, #976 + orr c, c, #0x100000000 + and c, c, p + + eor d0, d0, p + subs d0, d0, c + eor d1, d1, p + sbcs d1, d1, xzr + eor d2, d2, p + sbcs d2, d2, xzr + eor d3, d3, p + sbc d3, d3, xzr + +// Write back result and return + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_sqr_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_sqr_p256k1.S new file mode 100644 index 00000000000..b579acef398 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_sqr_p256k1.S @@ -0,0 +1,223 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square modulo p_256k1, z := (x^2) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_sqr_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p256k1) + .text + .balign 4 + +#define z x0 +#define x x1 + +// Variables + +#define u0 x2 +#define u1 x3 +#define u2 x4 +#define u3 x5 +#define u4 x6 +#define u5 x7 +#define u6 x8 +#define u7 x9 + +#define a0 x10 +#define a1 x11 +#define a2 x12 +#define b0 x13 +#define b1 x14 +#define b3 x15 +#define c x16 +#define d x17 + +// Some additional aliases + +#define l u4 +#define h u5 +#define b2 u6 +#define q u4 +#define a3 u7 + +S2N_BN_SYMBOL(bignum_sqr_p256k1): + +// First just a near-clone of bignum_sqr_4_8 to get the square, using +// different registers to collect full product without writeback. 
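The reduction applied after this full square rests on 2^256 == 4294968273 (mod p_256k1); a Python sketch of the fold-then-estimate strategy spelled out in the comments below (mod_p256k1_ref is an illustrative name):

import random

p = 2**256 - 4294968273
c = 4294968273
assert pow(2, 256, p) == c           # 2^256 == 4294968273 (mod p_256k1)

def mod_p256k1_ref(s):
    # Fold the 8-digit s = 2^256*h + l to the 5-digit c*h + l, then apply
    # the quotient estimate q = h' + 1 ("right or 1 too big") and correct.
    h, l = s >> 256, s % 2**256
    t = c * h + l
    q = (t >> 256) + 1
    r = t - q * p
    return r + p if r < 0 else r

for _ in range(1000):
    x = random.randrange(p)
    assert mod_p256k1_ref(x * x) == (x * x) % p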
+ + ldp x10, x11, [x1] + ldp x12, x13, [x1, #16] + umull x2, w10, w10 + lsr x14, x10, #32 + umull x3, w14, w14 + umull x14, w10, w14 + adds x2, x2, x14, lsl #33 + lsr x14, x14, #31 + adc x3, x3, x14 + umull x4, w11, w11 + lsr x14, x11, #32 + umull x5, w14, w14 + umull x14, w11, w14 + mul x15, x10, x11 + umulh x16, x10, x11 + adds x4, x4, x14, lsl #33 + lsr x14, x14, #31 + adc x5, x5, x14 + adds x15, x15, x15 + adcs x16, x16, x16 + adc x5, x5, xzr + adds x3, x3, x15 + adcs x4, x4, x16 + adc x5, x5, xzr + umull x6, w12, w12 + lsr x14, x12, #32 + umull x7, w14, w14 + umull x14, w12, w14 + adds x6, x6, x14, lsl #33 + lsr x14, x14, #31 + adc x7, x7, x14 + umull x8, w13, w13 + lsr x14, x13, #32 + umull x9, w14, w14 + umull x14, w13, w14 + mul x15, x12, x13 + umulh x16, x12, x13 + adds x8, x8, x14, lsl #33 + lsr x14, x14, #31 + adc x9, x9, x14 + adds x15, x15, x15 + adcs x16, x16, x16 + adc x9, x9, xzr + adds x7, x7, x15 + adcs x8, x8, x16 + adc x9, x9, xzr + subs x10, x10, x12 + sbcs x11, x11, x13 + csetm x16, cc + eor x10, x10, x16 + subs x10, x10, x16 + eor x11, x11, x16 + sbc x11, x11, x16 + adds x6, x6, x4 + adcs x7, x7, x5 + adcs x8, x8, xzr + adc x9, x9, xzr + umull x12, w10, w10 + lsr x5, x10, #32 + umull x13, w5, w5 + umull x5, w10, w5 + adds x12, x12, x5, lsl #33 + lsr x5, x5, #31 + adc x13, x13, x5 + umull x15, w11, w11 + lsr x5, x11, #32 + umull x14, w5, w5 + umull x5, w11, w5 + mul x4, x10, x11 + umulh x16, x10, x11 + adds x15, x15, x5, lsl #33 + lsr x5, x5, #31 + adc x14, x14, x5 + adds x4, x4, x4 + adcs x16, x16, x16 + adc x14, x14, xzr + adds x13, x13, x4 + adcs x15, x15, x16 + adc x14, x14, xzr + adds x4, x2, x6 + adcs x5, x3, x7 + adcs x6, x6, x8 + adcs x7, x7, x9 + csetm x16, cc + subs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x15 + sbcs x7, x7, x14 + adcs x8, x8, x16 + adc x9, x9, x16 + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [u7,u6,u5,u4] and l = [u3,u2,u1,u0] +// and this is == 4294968273 * h + l (mod p_256k1) +// Some of the word products are done straightforwardly using mul + umulh +// while others are broken down in a more complicated way as +// (2^32 + 977) * (2^32 * h + l) = 2^64 * h + 2^32 * (d * h + l) + d * l + + mov d, #977 + orr c, d, #0x100000000 + + mul a0, c, u4 + umulh b0, c, u4 + + and l, u5, #0xFFFFFFFF + lsr h, u5, #32 + mul a1, d, l + madd l, d, h, l + adds a1, a1, l, lsl #32 + lsr l, l, #32 + adc b1, h, l + + mul a2, c, u6 + umulh b2, c, u6 + + and l, u7, #0xFFFFFFFF + lsr h, u7, #32 + mul a3, d, l + madd l, d, h, l + adds a3, a3, l, lsl #32 + lsr l, l, #32 + adc b3, h, l + + adds u0, u0, a0 + adcs u1, u1, a1 + adcs u2, u2, a2 + adcs u3, u3, a3 + cset u4, cs + + adds u1, u1, b0 + adcs u2, u2, b1 + adcs u3, u3, b2 + adc u4, u4, b3 + +// Now we have reduced to 5 digits, 2^256 * h + l = [u4,u3,u2,u1,u0] +// Use q = h + 1 as the initial quotient estimate, either right or 1 too big. +// Since q <= 2^33 we do 4294968273 * q = (q<<32) + 977 * q to avoid umulh + + add q, u4, #1 + mul a0, d, q + lsr a1, q, #32 + adds a0, a0, q, lsl #32 + adc a1, xzr, a1 + adds u0, u0, a0 + adcs u1, u1, a1 + adcs u2, u2, xzr + adcs u3, u3, xzr + +// Now the effective answer is 2^256 * (CF - 1) + [u3,u2,u1,u0] +// So we correct if CF = 0 by subtracting 4294968273, i.e. 
by +// adding p_256k1 to the "full" answer + + csel c, c, xzr, cc + subs u0, u0, c + sbcs u1, u1, xzr + sbcs u2, u2, xzr + sbc u3, u3, xzr + +// Write back + + stp u0, u1, [x0] + stp u2, u3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_sqr_p256k1_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_sqr_p256k1_alt.S new file mode 100644 index 00000000000..3565e48300f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_sqr_p256k1_alt.S @@ -0,0 +1,174 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square modulo p_256k1, z := (x^2) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_sqr_p256k1_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p256k1_alt) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 + +#define h x6 +#define l x7 + +#define u0 x8 +#define u1 x9 +#define u2 x10 +#define u3 x11 +#define u4 x12 +#define u5 x13 +#define u6 x14 + +// Just aliases + +#define q a0 +#define c a1 +#define t a2 +#define u7 h + +S2N_BN_SYMBOL(bignum_sqr_p256k1_alt): + +// Load all the elements, set up an initial window [u6;...u1] = [23;03;01] +// and chain in the addition of 02 + 12 + 13 (no carry-out is possible). +// This gives all the "heterogeneous" terms of the squaring ready to double + + ldp a0, a1, [x] + + mul u1, a0, a1 + umulh u2, a0, a1 + + ldp a2, a3, [x, #16] + + mul u3, a0, a3 + umulh u4, a0, a3 + + mul l, a0, a2 + umulh h, a0, a2 + adds u2, u2, l + + adcs u3, u3, h + mul l, a1, a2 + umulh h, a1, a2 + adc h, h, xzr + adds u3, u3, l + + mul u5, a2, a3 + umulh u6, a2, a3 + + adcs u4, u4, h + mul l, a1, a3 + umulh h, a1, a3 + adc h, h, xzr + adds u4, u4, l + + adcs u5, u5, h + adc u6, u6, xzr + +// Now just double it; this simple approach seems to work better than extr + + adds u1, u1, u1 + adcs u2, u2, u2 + adcs u3, u3, u3 + adcs u4, u4, u4 + adcs u5, u5, u5 + adcs u6, u6, u6 + cset u7, cs + +// Add the homogeneous terms 00 + 11 + 22 + 33 + + umulh l, a0, a0 + mul u0, a0, a0 + adds u1, u1, l + + mul l, a1, a1 + adcs u2, u2, l + umulh l, a1, a1 + adcs u3, u3, l + + mul l, a2, a2 + adcs u4, u4, l + umulh l, a2, a2 + adcs u5, u5, l + + mul l, a3, a3 + adcs u6, u6, l + umulh l, a3, a3 + adc u7, u7, l + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [u7,u6,u5,u4] and l = [u3,u2,u1,u0] +// and this is == 4294968273 * h + l (mod p_256k1) + + mov c, #977 + orr c, c, #0x100000000 + + mul l, c, u4 + umulh t, c, u4 + adds u0, u0, l + + mul l, c, u5 + umulh u5, c, u5 + adcs u1, u1, l + + mul l, c, u6 + umulh u6, c, u6 + adcs u2, u2, l + + mul l, c, u7 + umulh u7, c, u7 + adcs u3, u3, l + cset u4, cs + + adds u1, u1, t + adcs u2, u2, u5 + adcs u3, u3, u6 + adc u4, u4, u7 + +// Now we have reduced to 5 digits, 2^256 * h + l = [u4,u3,u2,u1,u0] +// Use q = h + 1 as the initial quotient estimate, either right or 1 too big. 
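+// (Added sketch, not from the upstream source: with q = h + 1 and
+// p_256k1 = 2^256 - 4294968273, the remainder is
+//    z - q * p_256k1 = (l + q * 4294968273) - 2^256,
+// so the carry out of the addition below says whether the estimate was
+// exact (CF set) or one too big (CF clear), which the masked subtraction
+// just below then corrects.)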
+ + add q, u4, #1 + mul l, c, q + umulh h, c, q + adds u0, u0, l + adcs u1, u1, h + adcs u2, u2, xzr + adcs u3, u3, xzr + +// Now the effective answer is 2^256 * (CF - 1) + [u3,u2,u1,u0] +// So we correct if CF = 0 by subtracting 4294968273, i.e. by +// adding p_256k1 to the "full" answer + + csel c, c, xzr, cc + subs u0, u0, c + sbcs u1, u1, xzr + sbcs u2, u2, xzr + sbc u3, u3, xzr + +// Write back and return + + stp u0, u1, [x0] + stp u2, u3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_sub_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_sub_p256k1.S new file mode 100644 index 00000000000..b291a529358 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_sub_p256k1.S @@ -0,0 +1,68 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Subtract modulo p_256k1, z := (x - y) mod p_256k1 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_sub_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub_p256k1) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 +#define c x3 +#define l x4 +#define d0 x5 +#define d1 x6 +#define d2 x7 +#define d3 x8 + +S2N_BN_SYMBOL(bignum_sub_p256k1): + +// First just subtract the numbers as [d3; d2; d1; d0] = x - y, +// with the inverted carry flag meaning CF <=> x >= y. + + ldp d0, d1, [x] + ldp l, c, [y] + subs d0, d0, l + sbcs d1, d1, c + ldp d2, d3, [x, #16] + ldp l, c, [y, #16] + sbcs d2, d2, l + sbcs d3, d3, c + +// Now if x < y we want to add back p_256k1, which staying within 4 digits +// means subtracting 4294968273, since p_256k1 = 2^256 - 4294968273. +// Let c be that constant 4294968273 when x < y, zero otherwise. + + mov l, #977 + orr c, l, #0x100000000 + csel c, c, xzr, cc + +// Now correct by adding masked p_256k1, i.e. subtracting c + + subs d0, d0, c + sbcs d1, d1, xzr + sbcs d2, d2, xzr + sbc d3, d3, xzr + +// Store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_tomont_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_tomont_p256k1.S new file mode 100644 index 00000000000..b9284870de5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_tomont_p256k1.S @@ -0,0 +1,101 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert to Montgomery form z := (2^256 * x) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_tomont_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tomont_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tomont_p256k1) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tomont_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tomont_p256k1_alt) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define m x2 + +#define d0 x3 +#define d1 x4 +#define d2 x5 +#define d3 x6 +#define a0 x7 +#define a1 x8 + +#define a2 x9 +#define c x9 + +#define a3 x10 +#define h x10 +#define q x10 + +S2N_BN_SYMBOL(bignum_tomont_p256k1): + +S2N_BN_SYMBOL(bignum_tomont_p256k1_alt): + +// Since 2^256 == 4294968273 (mod p_256k1) we more or less just set +// m = 4294968273 then devolve to a near-clone of bignum_cmul_p256k1; +// the logic that q = h + 1 < 2^64 and hence doesn't wrap still holds +// since the multiplier 4294968273 is known to be much less than 2^64. +// We can also re-use the initial constant m instead of re-creating it. + + mov m, #977 + orr m, m, #0x100000000 + +// First do the multiply, straightforwardly to get [h;d3;d2;d1;d0] + + ldp a0, a1, [x] + ldp a2, a3, [x, #16] + mul d0, m, a0 + mul d1, m, a1 + mul d2, m, a2 + mul d3, m, a3 + umulh a0, m, a0 + umulh a1, m, a1 + umulh a2, m, a2 + umulh h, m, a3 + adds d1, d1, a0 + adcs d2, d2, a1 + adcs d3, d3, a2 + adcs h, h, xzr + +// Now the quotient estimate is q = h + 1, and then we do the reduction, +// writing z = [d3;d2;d1;d0], as z' = (2^256 * h + z) - q * p_256k1 = +// (2^256 * h + z) - q * (2^256 - 4294968273) = -2^256 + (z + 4294968273 * q) + + add q, h, #1 + mul a0, q, m + umulh a1, q, m + adds d0, d0, a0 + adcs d1, d1, a1 + adcs d2, d2, xzr + adcs d3, d3, xzr + +// Because of the implicit -2^256, CF means >= 0 so z' is the answer; ~CF +// means z' < 0 so we add p_256k1, which in 4 digits means subtracting m. + + csel m, m, xzr, cc + subs d0, d0, m + sbcs d1, d1, xzr + sbcs d2, d2, xzr + sbcs d3, d3, xzr + +// Finally store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_triple_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_triple_p256k1.S new file mode 100644 index 00000000000..803ca582f06 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_triple_p256k1.S @@ -0,0 +1,102 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Triple modulo p_256k1, z := (3 * x) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_triple_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// The input x can be any 4-digit bignum, not necessarily reduced modulo +// p_256k1, and the result is always fully reduced, z = (3 * x) mod p_256k1. 
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_p256k1) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_p256k1_alt) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 +#define h x6 + +// Slightly offset aliases for the d_i for readability. + +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define a3 x6 + +// More aliases for the same thing at different stages + +#define m x6 + +// Other temporary variables + +#define c x7 + +S2N_BN_SYMBOL(bignum_triple_p256k1): +S2N_BN_SYMBOL(bignum_triple_p256k1_alt): + +// Load the inputs + + ldp a0, a1, [x] + ldp a2, a3, [x, #16] + +// First do the multiplication by 3, getting z = [h; d3; ...; d0] + + adds d0, a0, a0, lsl #1 + extr d1, a1, a0, #63 + adcs d1, d1, a1 + extr d2, a2, a1, #63 + adcs d2, d2, a2 + extr d3, a3, a2, #63 + adcs d3, d3, a3 + lsr h, a3, #63 + adc h, h, xzr + +// For this limited range a simple quotient estimate of q = h + 1 works, where +// h = floor(z / 2^256). Then -p_256k1 <= z - q * p_256k1 < p_256k1. + + mov c, #977 + orr c, c, #0x100000000 + madd m, h, c, c + +// Initial subtraction of z - q * p_256k1, actually by adding q * 4294968273. + + adds d0, d0, m + adcs d1, d1, xzr + adcs d2, d2, xzr + adcs d3, d3, xzr + +// With z = 2^256 * h + l, the underlying result z' is actually +// (2^256 * h + l) - q * (2^256 - 4294968273) = (l + q * 4294968273) - 2^256 +// so carry-clear <=> z' is negative. Correct by subtracting in that case. + + csel c, c, xzr, cc + subs d0, d0, c + sbcs d1, d1, xzr + sbcs d2, d2, xzr + sbc d3, d3, xzr + +// Finally store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jadd.S new file mode 100644 index 00000000000..52545b3fd83 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jadd.S @@ -0,0 +1,549 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on SECG curve secp256k1 in Jacobian coordinates +// +// extern void secp256k1_jadd +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// It is assumed that all coordinates of the input points p1 and p2 are +// fully reduced mod p_256k1, that both z coordinates are nonzero and +// that neither p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents +// the same affine point as". 
+// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(secp256k1_jadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(secp256k1_jadd) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x19 +#define input_x x20 +#define input_y x21 + +// The magic constant 2^256 - p_256k1 + +#define pconst x17 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE +#define z_2 input_y, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define x1a sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define z2sq sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define y1a sp, #(NUMSIZE*6) + +#define NSPACE (NUMSIZE*7) + +// Corresponds exactly to bignum_mul_p256k1 except for registers and +// re-use of the pconst register for the constant 4294968273 + +#define mul_p256k1(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + mul x7, x3, x5 __LF \ + umulh x8, x3, x5 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, lo __LF \ + csetm x16, lo __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, lo __LF \ + cinv x16, x16, lo __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + mul x11, x3, x5 __LF \ + umulh x12, x3, x5 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, lo __LF \ + csetm x16, lo __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, lo __LF \ + cinv x16, x16, lo __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, lo __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, lo __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc 
x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, lo __LF \ + csetm x9, lo __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, lo __LF \ + cinv x9, x9, lo __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x16, #977 __LF \ + mul x3, pconst, x11 __LF \ + umulh x5, pconst, x11 __LF \ + and x15, x12, #0xffffffff __LF \ + lsr x2, x12, #32 __LF \ + mul x4, x16, x15 __LF \ + madd x15, x16, x2, x15 __LF \ + adds x4, x4, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x6, x2, x15 __LF \ + mul x11, pconst, x13 __LF \ + umulh x13, pconst, x13 __LF \ + and x15, x14, #0xffffffff __LF \ + lsr x2, x14, #32 __LF \ + mul x12, x16, x15 __LF \ + madd x15, x16, x2, x15 __LF \ + adds x12, x12, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x14, x2, x15 __LF \ + adds x7, x7, x3 __LF \ + adcs x8, x8, x4 __LF \ + adcs x9, x9, x11 __LF \ + adcs x10, x10, x12 __LF \ + cset x11, hs __LF \ + adds x8, x8, x5 __LF \ + adcs x9, x9, x6 __LF \ + adcs x10, x10, x13 __LF \ + adc x11, x11, x14 __LF \ + add x0, x11, #1 __LF \ + mul x3, x16, x0 __LF \ + lsr x4, x0, #32 __LF \ + adds x3, x3, x0, lsl #32 __LF \ + adc x4, xzr, x4 __LF \ + adds x7, x7, x3 __LF \ + adcs x8, x8, x4 __LF \ + adcs x9, x9, xzr __LF \ + adcs x10, x10, xzr __LF \ + csel x1, pconst, xzr, lo __LF \ + subs x7, x7, x1 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] + +// Corresponds exactly to bignum_sqr_p256k1 except for +// re-use of the pconst register for the constant 4294968273 + +#define sqr_p256k1(P0,P1) \ + ldp x10, x11, [P1] __LF \ + ldp x12, x13, [P1+16] __LF \ + umull x2, w10, w10 __LF \ + lsr x14, x10, #32 __LF \ + umull x3, w14, w14 __LF \ + umull x14, w10, w14 __LF \ + adds x2, x2, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x3, x3, x14 __LF \ + umull x4, w11, w11 __LF \ + lsr x14, x11, #32 __LF \ + umull x5, w14, w14 __LF \ + umull x14, w11, w14 __LF \ + mul x15, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x4, x4, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x5, x5, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x5, x5, xzr __LF \ + adds x3, x3, x15 __LF \ + adcs x4, x4, x16 __LF \ + adc x5, x5, xzr __LF \ + umull x6, w12, w12 __LF \ + lsr x14, x12, #32 __LF \ + umull x7, w14, w14 __LF \ + umull x14, w12, w14 __LF \ + adds x6, x6, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x7, x7, x14 __LF \ + umull x8, w13, w13 __LF \ + lsr x14, x13, #32 __LF \ + umull x9, w14, w14 __LF \ + umull x14, w13, w14 __LF \ + mul x15, x12, x13 __LF \ + umulh x16, x12, x13 __LF \ + adds x8, x8, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x9, x9, x14 __LF \ + adds x15, x15, 
x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x9, x9, xzr __LF \ + adds x7, x7, x15 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, xzr __LF \ + subs x10, x10, x12 __LF \ + sbcs x11, x11, x13 __LF \ + csetm x16, lo __LF \ + eor x10, x10, x16 __LF \ + subs x10, x10, x16 __LF \ + eor x11, x11, x16 __LF \ + sbc x11, x11, x16 __LF \ + adds x6, x6, x4 __LF \ + adcs x7, x7, x5 __LF \ + adcs x8, x8, xzr __LF \ + adc x9, x9, xzr __LF \ + umull x12, w10, w10 __LF \ + lsr x5, x10, #32 __LF \ + umull x13, w5, w5 __LF \ + umull x5, w10, w5 __LF \ + adds x12, x12, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x13, x13, x5 __LF \ + umull x15, w11, w11 __LF \ + lsr x5, x11, #32 __LF \ + umull x14, w5, w5 __LF \ + umull x5, w11, w5 __LF \ + mul x4, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x15, x15, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x14, x14, x5 __LF \ + adds x4, x4, x4 __LF \ + adcs x16, x16, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x13, x13, x4 __LF \ + adcs x15, x15, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x4, x2, x6 __LF \ + adcs x5, x3, x7 __LF \ + adcs x6, x6, x8 __LF \ + adcs x7, x7, x9 __LF \ + csetm x16, lo __LF \ + subs x4, x4, x12 __LF \ + sbcs x5, x5, x13 __LF \ + sbcs x6, x6, x15 __LF \ + sbcs x7, x7, x14 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, x16 __LF \ + mov x16, #977 __LF \ + mul x10, pconst, x6 __LF \ + umulh x13, pconst, x6 __LF \ + and x6, x7, #0xffffffff __LF \ + lsr x7, x7, #32 __LF \ + mul x11, x16, x6 __LF \ + madd x6, x16, x7, x6 __LF \ + adds x11, x11, x6, lsl #32 __LF \ + lsr x6, x6, #32 __LF \ + adc x14, x7, x6 __LF \ + mul x12, pconst, x8 __LF \ + umulh x8, pconst, x8 __LF \ + and x6, x9, #0xffffffff __LF \ + lsr x7, x9, #32 __LF \ + mul x9, x16, x6 __LF \ + madd x6, x16, x7, x6 __LF \ + adds x9, x9, x6, lsl #32 __LF \ + lsr x6, x6, #32 __LF \ + adc x15, x7, x6 __LF \ + adds x2, x2, x10 __LF \ + adcs x3, x3, x11 __LF \ + adcs x4, x4, x12 __LF \ + adcs x5, x5, x9 __LF \ + cset x6, hs __LF \ + adds x3, x3, x13 __LF \ + adcs x4, x4, x14 __LF \ + adcs x5, x5, x8 __LF \ + adc x6, x6, x15 __LF \ + add x6, x6, #1 __LF \ + mul x10, x16, x6 __LF \ + lsr x11, x6, #32 __LF \ + adds x10, x10, x6, lsl #32 __LF \ + adc x11, xzr, x11 __LF \ + adds x2, x2, x10 __LF \ + adcs x3, x3, x11 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + csel x16, pconst, xzr, lo __LF \ + subs x2, x2, x16 __LF \ + sbcs x3, x3, xzr __LF \ + sbcs x4, x4, xzr __LF \ + sbc x5, x5, xzr __LF \ + stp x2, x3, [P0] __LF \ + stp x4, x5, [P0+16] + +// Corresponds exactly to bignum_sub_p256k1 + +#define sub_p256k1(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #0x3d1 __LF \ + orr x3, x4, #0x100000000 __LF \ + csel x3, x3, xzr, cc __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +S2N_BN_SYMBOL(secp256k1_jadd): + +// Save registers and make room on stack for temporary variables + + sub sp, sp, NSPACE+32 + stp x19, x20, [sp, NSPACE] + stp x21, x22, [sp, NSPACE+16] + +// Move the input arguments to stable place + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Set up pconst = 4294968273, so p_256k1 = 2^256 - pconst + + mov pconst, #977 + orr pconst, pconst, #0x100000000 + +// Main code, just a sequence of basic field operations + + sqr_p256k1(z1sq,z_1) + 
sqr_p256k1(z2sq,z_2) + + mul_p256k1(y1a,z_2,y_1) + mul_p256k1(y2a,z_1,y_2) + + mul_p256k1(x2a,z1sq,x_2) + mul_p256k1(x1a,z2sq,x_1) + mul_p256k1(y2a,z1sq,y2a) + mul_p256k1(y1a,z2sq,y1a) + + sub_p256k1(xd,x2a,x1a) + sub_p256k1(yd,y2a,y1a) + + sqr_p256k1(zz,xd) + sqr_p256k1(ww,yd) + + mul_p256k1(zzx1,zz,x1a) + mul_p256k1(zzx2,zz,x2a) + + sub_p256k1(resx,ww,zzx1) + sub_p256k1(t1,zzx2,zzx1) + + mul_p256k1(xd,xd,z_1) + + sub_p256k1(resx,resx,zzx2) + + sub_p256k1(t2,zzx1,resx) + + mul_p256k1(t1,t1,y1a) + mul_p256k1(resz,xd,z_2) + mul_p256k1(t2,yd,t2) + + sub_p256k1(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) +// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + + orr x12, x0, x1 + orr x13, x2, x3 + orr x12, x12, x13 + cmp x12, xzr + cset x12, ne + + ldp x4, x5, [z_2] + ldp x6, x7, [z_2+16] + + orr x13, x4, x5 + orr x14, x6, x7 + orr x13, x13, x14 + cmp x13, xzr + cset x13, ne + + cmp x13, x12 + +// Multiplex the outputs accordingly, re-using the z's in registers + + ldp x8, x9, [resz] + csel x8, x0, x8, lo + csel x9, x1, x9, lo + csel x8, x4, x8, hi + csel x9, x5, x9, hi + ldp x10, x11, [resz+16] + csel x10, x2, x10, lo + csel x11, x3, x11, lo + csel x10, x6, x10, hi + csel x11, x7, x11, hi + + ldp x12, x13, [x_1] + ldp x0, x1, [resx] + csel x0, x12, x0, lo + csel x1, x13, x1, lo + ldp x12, x13, [x_2] + csel x0, x12, x0, hi + csel x1, x13, x1, hi + + ldp x12, x13, [x_1+16] + ldp x2, x3, [resx+16] + csel x2, x12, x2, lo + csel x3, x13, x3, lo + ldp x12, x13, [x_2+16] + csel x2, x12, x2, hi + csel x3, x13, x3, hi + + ldp x12, x13, [y_1] + ldp x4, x5, [resy] + csel x4, x12, x4, lo + csel x5, x13, x5, lo + ldp x12, x13, [y_2] + csel x4, x12, x4, hi + csel x5, x13, x5, hi + + ldp x12, x13, [y_1+16] + ldp x6, x7, [resy+16] + csel x6, x12, x6, lo + csel x7, x13, x7, lo + ldp x12, x13, [y_2+16] + csel x6, x12, x6, hi + csel x7, x13, x7, hi + +// Finally store back the multiplexed values + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [y_3] + stp x6, x7, [y_3+16] + stp x8, x9, [z_3] + stp x10, x11, [z_3+16] + +// Restore stack and return + + ldp x19, x20, [sp, NSPACE] + ldp x21, x22, [sp, NSPACE+16] + add sp, sp, NSPACE+32 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jadd_alt.S new file mode 100644 index 00000000000..b62656fccb7 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jadd_alt.S @@ -0,0 +1,421 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on SECG curve secp256k1 in Jacobian coordinates +// +// extern void secp256k1_jadd_alt +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). 
+// It is assumed that all coordinates of the input points p1 and p2 are +// fully reduced mod p_256k1, that both z coordinates are nonzero and +// that neither p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents +// the same affine point as". +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(secp256k1_jadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(secp256k1_jadd_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x15 +#define input_x x16 +#define input_y x17 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE +#define z_2 input_y, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define x1a sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define z2sq sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define y1a sp, #(NUMSIZE*6) + +#define NSPACE (NUMSIZE*7) + +// Corresponds exactly to bignum_mul_p256k1_alt + +#define mul_p256k1(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x0, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x1, x3, x10 __LF \ + adcs x0, x0, x11 __LF \ + adc x1, x1, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x1, x1, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x0, x0, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x1, x1, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x0, x0, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x1, x1, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x0, x0, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x1, x1, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 
__LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x3d1 __LF \ + orr x7, x7, #0x100000000 __LF \ + mul x11, x7, x1 __LF \ + umulh x9, x7, x1 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x0, x0, x11 __LF \ + cset x1, cs __LF \ + adds x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x0, x0, x4 __LF \ + adc x1, x1, x5 __LF \ + add x8, x1, #0x1 __LF \ + mul x11, x7, x8 __LF \ + umulh x9, x7, x8 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, xzr __LF \ + adcs x0, x0, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x0, x0, xzr __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x0, [P0+16] + +// Corresponds exactly to bignum_sqr_p256k1_alt + +#define sqr_p256k1(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x7, x2, x4 __LF \ + umulh x6, x2, x4 __LF \ + adds x10, x10, x7 __LF \ + adcs x11, x11, x6 __LF \ + mul x7, x3, x4 __LF \ + umulh x6, x3, x4 __LF \ + adc x6, x6, xzr __LF \ + adds x11, x11, x7 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x6 __LF \ + mul x7, x3, x5 __LF \ + umulh x6, x3, x5 __LF \ + adc x6, x6, xzr __LF \ + adds x12, x12, x7 __LF \ + adcs x13, x13, x6 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x6, cs __LF \ + umulh x7, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x7 __LF \ + mul x7, x3, x3 __LF \ + adcs x10, x10, x7 __LF \ + umulh x7, x3, x3 __LF \ + adcs x11, x11, x7 __LF \ + mul x7, x4, x4 __LF \ + adcs x12, x12, x7 __LF \ + umulh x7, x4, x4 __LF \ + adcs x13, x13, x7 __LF \ + mul x7, x5, x5 __LF \ + adcs x14, x14, x7 __LF \ + umulh x7, x5, x5 __LF \ + adc x6, x6, x7 __LF \ + mov x3, #0x3d1 __LF \ + orr x3, x3, #0x100000000 __LF \ + mul x7, x3, x12 __LF \ + umulh x4, x3, x12 __LF \ + adds x8, x8, x7 __LF \ + mul x7, x3, x13 __LF \ + umulh x13, x3, x13 __LF \ + adcs x9, x9, x7 __LF \ + mul x7, x3, x14 __LF \ + umulh x14, x3, x14 __LF \ + adcs x10, x10, x7 __LF \ + mul x7, x3, x6 __LF \ + umulh x6, x3, x6 __LF \ + adcs x11, x11, x7 __LF \ + cset x12, cs __LF \ + adds x9, x9, x4 __LF \ + adcs x10, x10, x13 __LF \ + adcs x11, x11, x14 __LF \ + adc x12, x12, x6 __LF \ + add x2, x12, #0x1 __LF \ + mul x7, x3, x2 __LF \ + umulh x6, x3, x2 __LF \ + adds x8, x8, x7 __LF \ + adcs x9, x9, x6 __LF \ + adcs x10, x10, xzr __LF \ + adcs x11, x11, xzr __LF \ + csel x3, x3, xzr, cc __LF \ + subs x8, x8, x3 __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbc x11, x11, xzr __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Corresponds exactly to bignum_sub_p256k1 + +#define sub_p256k1(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #0x3d1 __LF \ + orr x3, x4, #0x100000000 __LF \ + csel x3, x3, xzr, cc __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + 
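+// (Readability sketch, not part of the upstream file; the labels a, b, c,
+// d, e, f are introduced only for this note. With a = x1*z2^2, b = x2*z1^2,
+// c = y1*z2^3, d = y2*z1^3, e = b - a and f = d - c, the sequence below
+// computes
+//    x3 = f^2 - e^2 * (a + b)
+//    y3 = f * (e^2 * a - x3) - c * e^3
+//    z3 = e * z1 * z2
+// and finally multiplexes in the original inputs to cover z1 = 0 or z2 = 0.)
+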
+S2N_BN_SYMBOL(secp256k1_jadd_alt): + +// Make room on stack for temporary variables +// Move the input arguments to stable places + + sub sp, sp, NSPACE + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations + + sqr_p256k1(z1sq,z_1) + sqr_p256k1(z2sq,z_2) + + mul_p256k1(y1a,z_2,y_1) + mul_p256k1(y2a,z_1,y_2) + + mul_p256k1(x2a,z1sq,x_2) + mul_p256k1(x1a,z2sq,x_1) + mul_p256k1(y2a,z1sq,y2a) + mul_p256k1(y1a,z2sq,y1a) + + sub_p256k1(xd,x2a,x1a) + sub_p256k1(yd,y2a,y1a) + + sqr_p256k1(zz,xd) + sqr_p256k1(ww,yd) + + mul_p256k1(zzx1,zz,x1a) + mul_p256k1(zzx2,zz,x2a) + + sub_p256k1(resx,ww,zzx1) + sub_p256k1(t1,zzx2,zzx1) + + mul_p256k1(xd,xd,z_1) + + sub_p256k1(resx,resx,zzx2) + + sub_p256k1(t2,zzx1,resx) + + mul_p256k1(t1,t1,y1a) + mul_p256k1(resz,xd,z_2) + mul_p256k1(t2,yd,t2) + + sub_p256k1(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) +// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + + orr x12, x0, x1 + orr x13, x2, x3 + orr x12, x12, x13 + cmp x12, xzr + cset x12, ne + + ldp x4, x5, [z_2] + ldp x6, x7, [z_2+16] + + orr x13, x4, x5 + orr x14, x6, x7 + orr x13, x13, x14 + cmp x13, xzr + cset x13, ne + + cmp x13, x12 + +// Multiplex the outputs accordingly, re-using the z's in registers + + ldp x8, x9, [resz] + csel x8, x0, x8, lo + csel x9, x1, x9, lo + csel x8, x4, x8, hi + csel x9, x5, x9, hi + ldp x10, x11, [resz+16] + csel x10, x2, x10, lo + csel x11, x3, x11, lo + csel x10, x6, x10, hi + csel x11, x7, x11, hi + + ldp x12, x13, [x_1] + ldp x0, x1, [resx] + csel x0, x12, x0, lo + csel x1, x13, x1, lo + ldp x12, x13, [x_2] + csel x0, x12, x0, hi + csel x1, x13, x1, hi + + ldp x12, x13, [x_1+16] + ldp x2, x3, [resx+16] + csel x2, x12, x2, lo + csel x3, x13, x3, lo + ldp x12, x13, [x_2+16] + csel x2, x12, x2, hi + csel x3, x13, x3, hi + + ldp x12, x13, [y_1] + ldp x4, x5, [resy] + csel x4, x12, x4, lo + csel x5, x13, x5, lo + ldp x12, x13, [y_2] + csel x4, x12, x4, hi + csel x5, x13, x5, hi + + ldp x12, x13, [y_1+16] + ldp x6, x7, [resy+16] + csel x6, x12, x6, lo + csel x7, x13, x7, lo + ldp x12, x13, [y_2+16] + csel x6, x12, x6, hi + csel x7, x13, x7, hi + +// Finally store back the multiplexed values + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [y_3] + stp x6, x7, [y_3+16] + stp x8, x9, [z_3] + stp x10, x11, [z_3+16] + +// Restore stack and return + + add sp, sp, NSPACE + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jdouble.S new file mode 100644 index 00000000000..22b30d022e4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jdouble.S @@ -0,0 +1,890 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on SECG curve secp256k1 in Jacobian coordinates +// +// extern void secp256k1_jdouble +// (uint64_t p3[static 12],uint64_t p1[static 12]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). 
+// It is assumed that all coordinates of the input point are fully +// reduced mod p_256k1 and that the z coordinate is not zero. +// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(secp256k1_jdouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(secp256k1_jdouble) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x19 +#define input_x x20 + +// The magic constant 2^256 - p_256k1 + +#define pconst x17 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries + +#define x_2 sp, #(NUMSIZE*0) +#define y_2 sp, #(NUMSIZE*1) +#define d sp, #(NUMSIZE*2) +#define tmp sp, #(NUMSIZE*3) +#define x_4 sp, #(NUMSIZE*4) +#define y_4 sp, #(NUMSIZE*6) +#define dx2 sp, #(NUMSIZE*8) +#define xy2 sp, #(NUMSIZE*10) + +#define NSPACE #(NUMSIZE*12) + +// Corresponds exactly to bignum_mul_p256k1 except for registers and +// re-use of the pconst register for the constant 4294968273 + +#define mul_p256k1(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + mul x7, x3, x5 __LF \ + umulh x8, x3, x5 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, lo __LF \ + csetm x16, lo __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, lo __LF \ + cinv x16, x16, lo __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + mul x11, x3, x5 __LF \ + umulh x12, x3, x5 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, lo __LF \ + csetm x16, lo __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, lo __LF \ + cinv x16, x16, lo __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, lo __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, lo __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, lo __LF \ + csetm x9, lo __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, lo __LF \ + cinv x9, x9, lo __LF \ + mul x5, x4, x6 __LF \ + 
umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x16, #977 __LF \ + mul x3, pconst, x11 __LF \ + umulh x5, pconst, x11 __LF \ + and x15, x12, #0xffffffff __LF \ + lsr x2, x12, #32 __LF \ + mul x4, x16, x15 __LF \ + madd x15, x16, x2, x15 __LF \ + adds x4, x4, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x6, x2, x15 __LF \ + mul x11, pconst, x13 __LF \ + umulh x13, pconst, x13 __LF \ + and x15, x14, #0xffffffff __LF \ + lsr x2, x14, #32 __LF \ + mul x12, x16, x15 __LF \ + madd x15, x16, x2, x15 __LF \ + adds x12, x12, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x14, x2, x15 __LF \ + adds x7, x7, x3 __LF \ + adcs x8, x8, x4 __LF \ + adcs x9, x9, x11 __LF \ + adcs x10, x10, x12 __LF \ + cset x11, hs __LF \ + adds x8, x8, x5 __LF \ + adcs x9, x9, x6 __LF \ + adcs x10, x10, x13 __LF \ + adc x11, x11, x14 __LF \ + add x0, x11, #1 __LF \ + mul x3, x16, x0 __LF \ + lsr x4, x0, #32 __LF \ + adds x3, x3, x0, lsl #32 __LF \ + adc x4, xzr, x4 __LF \ + adds x7, x7, x3 __LF \ + adcs x8, x8, x4 __LF \ + adcs x9, x9, xzr __LF \ + adcs x10, x10, xzr __LF \ + csel x1, pconst, xzr, lo __LF \ + subs x7, x7, x1 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] + +// Corresponds exactly to bignum_sqr_p256k1 except for +// re-use of the pconst register for the constant 4294968273 + +#define sqr_p256k1(P0,P1) \ + ldp x10, x11, [P1] __LF \ + ldp x12, x13, [P1+16] __LF \ + umull x2, w10, w10 __LF \ + lsr x14, x10, #32 __LF \ + umull x3, w14, w14 __LF \ + umull x14, w10, w14 __LF \ + adds x2, x2, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x3, x3, x14 __LF \ + umull x4, w11, w11 __LF \ + lsr x14, x11, #32 __LF \ + umull x5, w14, w14 __LF \ + umull x14, w11, w14 __LF \ + mul x15, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x4, x4, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x5, x5, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x5, x5, xzr __LF \ + adds x3, x3, x15 __LF \ + adcs x4, x4, x16 __LF \ + adc x5, x5, xzr __LF \ + umull x6, w12, w12 __LF \ + lsr x14, x12, #32 __LF \ + umull x7, w14, w14 __LF \ + umull x14, w12, w14 __LF \ + adds x6, x6, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x7, x7, x14 __LF \ + umull x8, w13, w13 __LF \ + lsr x14, x13, #32 __LF \ + umull x9, w14, w14 __LF \ + umull x14, w13, w14 __LF \ + mul x15, x12, x13 __LF \ + umulh x16, x12, x13 __LF \ + adds x8, x8, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x9, x9, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x9, x9, xzr __LF \ + adds x7, x7, x15 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, xzr __LF \ + subs x10, x10, x12 __LF \ + sbcs x11, x11, x13 __LF \ + csetm x16, lo __LF \ + eor x10, x10, x16 __LF \ + subs x10, x10, x16 __LF \ + eor x11, x11, x16 __LF \ + sbc x11, x11, x16 __LF \ + adds x6, x6, x4 
__LF \ + adcs x7, x7, x5 __LF \ + adcs x8, x8, xzr __LF \ + adc x9, x9, xzr __LF \ + umull x12, w10, w10 __LF \ + lsr x5, x10, #32 __LF \ + umull x13, w5, w5 __LF \ + umull x5, w10, w5 __LF \ + adds x12, x12, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x13, x13, x5 __LF \ + umull x15, w11, w11 __LF \ + lsr x5, x11, #32 __LF \ + umull x14, w5, w5 __LF \ + umull x5, w11, w5 __LF \ + mul x4, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x15, x15, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x14, x14, x5 __LF \ + adds x4, x4, x4 __LF \ + adcs x16, x16, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x13, x13, x4 __LF \ + adcs x15, x15, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x4, x2, x6 __LF \ + adcs x5, x3, x7 __LF \ + adcs x6, x6, x8 __LF \ + adcs x7, x7, x9 __LF \ + csetm x16, lo __LF \ + subs x4, x4, x12 __LF \ + sbcs x5, x5, x13 __LF \ + sbcs x6, x6, x15 __LF \ + sbcs x7, x7, x14 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, x16 __LF \ + mov x16, #977 __LF \ + mul x10, pconst, x6 __LF \ + umulh x13, pconst, x6 __LF \ + and x6, x7, #0xffffffff __LF \ + lsr x7, x7, #32 __LF \ + mul x11, x16, x6 __LF \ + madd x6, x16, x7, x6 __LF \ + adds x11, x11, x6, lsl #32 __LF \ + lsr x6, x6, #32 __LF \ + adc x14, x7, x6 __LF \ + mul x12, pconst, x8 __LF \ + umulh x8, pconst, x8 __LF \ + and x6, x9, #0xffffffff __LF \ + lsr x7, x9, #32 __LF \ + mul x9, x16, x6 __LF \ + madd x6, x16, x7, x6 __LF \ + adds x9, x9, x6, lsl #32 __LF \ + lsr x6, x6, #32 __LF \ + adc x15, x7, x6 __LF \ + adds x2, x2, x10 __LF \ + adcs x3, x3, x11 __LF \ + adcs x4, x4, x12 __LF \ + adcs x5, x5, x9 __LF \ + cset x6, hs __LF \ + adds x3, x3, x13 __LF \ + adcs x4, x4, x14 __LF \ + adcs x5, x5, x8 __LF \ + adc x6, x6, x15 __LF \ + add x6, x6, #1 __LF \ + mul x10, x16, x6 __LF \ + lsr x11, x6, #32 __LF \ + adds x10, x10, x6, lsl #32 __LF \ + adc x11, xzr, x11 __LF \ + adds x2, x2, x10 __LF \ + adcs x3, x3, x11 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + csel x16, pconst, xzr, lo __LF \ + subs x2, x2, x16 __LF \ + sbcs x3, x3, xzr __LF \ + sbcs x4, x4, xzr __LF \ + sbc x5, x5, xzr __LF \ + stp x2, x3, [P0] __LF \ + stp x4, x5, [P0+16] + +// Rough versions producing 5-word results + +#define roughmul_p256k1(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + mul x7, x3, x5 __LF \ + umulh x8, x3, x5 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, lo __LF \ + csetm x16, lo __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, lo __LF \ + cinv x16, x16, lo __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + mul x11, x3, x5 __LF \ + umulh x12, x3, x5 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, lo __LF \ + csetm x16, lo __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, lo __LF \ + cinv x16, x16, lo __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, 
[P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, lo __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, lo __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, lo __LF \ + csetm x9, lo __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, lo __LF \ + cinv x9, x9, lo __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x16, #977 __LF \ + mul x3, pconst, x11 __LF \ + umulh x5, pconst, x11 __LF \ + and x15, x12, #0xffffffff __LF \ + lsr x2, x12, #32 __LF \ + mul x4, x16, x15 __LF \ + madd x15, x16, x2, x15 __LF \ + adds x4, x4, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x6, x2, x15 __LF \ + mul x11, pconst, x13 __LF \ + umulh x13, pconst, x13 __LF \ + and x15, x14, #0xffffffff __LF \ + lsr x2, x14, #32 __LF \ + mul x12, x16, x15 __LF \ + madd x15, x16, x2, x15 __LF \ + adds x12, x12, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x14, x2, x15 __LF \ + adds x7, x7, x3 __LF \ + adcs x8, x8, x4 __LF \ + adcs x9, x9, x11 __LF \ + adcs x10, x10, x12 __LF \ + cset x11, hs __LF \ + adds x8, x8, x5 __LF \ + adcs x9, x9, x6 __LF \ + adcs x10, x10, x13 __LF \ + adc x11, x11, x14 __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] __LF \ + str x11, [P0+32] + +#define roughsqr_p256k1(P0,P1) \ + ldp x10, x11, [P1] __LF \ + ldp x12, x13, [P1+16] __LF \ + umull x2, w10, w10 __LF \ + lsr x14, x10, #32 __LF \ + umull x3, w14, w14 __LF \ + umull x14, w10, w14 __LF \ + adds x2, x2, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x3, x3, x14 __LF \ + umull x4, w11, w11 __LF \ + lsr x14, x11, #32 __LF \ + umull x5, w14, w14 __LF \ + umull x14, w11, w14 __LF \ + mul x15, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x4, x4, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x5, x5, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x5, x5, xzr __LF \ + adds x3, x3, x15 __LF \ + adcs x4, x4, x16 __LF \ + adc x5, x5, xzr __LF \ + umull x6, w12, w12 __LF \ + lsr x14, x12, #32 __LF \ + umull x7, w14, w14 __LF \ + umull x14, w12, w14 __LF \ + adds x6, x6, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x7, x7, x14 __LF \ + umull x8, w13, w13 __LF \ + lsr x14, x13, #32 __LF \ + umull x9, w14, w14 __LF \ + umull x14, w13, w14 __LF \ + mul x15, x12, x13 __LF \ + umulh x16, x12, x13 __LF \ + adds x8, x8, x14, lsl #33 __LF \ + lsr 
x14, x14, #31 __LF \ + adc x9, x9, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x9, x9, xzr __LF \ + adds x7, x7, x15 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, xzr __LF \ + subs x10, x10, x12 __LF \ + sbcs x11, x11, x13 __LF \ + csetm x16, lo __LF \ + eor x10, x10, x16 __LF \ + subs x10, x10, x16 __LF \ + eor x11, x11, x16 __LF \ + sbc x11, x11, x16 __LF \ + adds x6, x6, x4 __LF \ + adcs x7, x7, x5 __LF \ + adcs x8, x8, xzr __LF \ + adc x9, x9, xzr __LF \ + umull x12, w10, w10 __LF \ + lsr x5, x10, #32 __LF \ + umull x13, w5, w5 __LF \ + umull x5, w10, w5 __LF \ + adds x12, x12, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x13, x13, x5 __LF \ + umull x15, w11, w11 __LF \ + lsr x5, x11, #32 __LF \ + umull x14, w5, w5 __LF \ + umull x5, w11, w5 __LF \ + mul x4, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x15, x15, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x14, x14, x5 __LF \ + adds x4, x4, x4 __LF \ + adcs x16, x16, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x13, x13, x4 __LF \ + adcs x15, x15, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x4, x2, x6 __LF \ + adcs x5, x3, x7 __LF \ + adcs x6, x6, x8 __LF \ + adcs x7, x7, x9 __LF \ + csetm x16, lo __LF \ + subs x4, x4, x12 __LF \ + sbcs x5, x5, x13 __LF \ + sbcs x6, x6, x15 __LF \ + sbcs x7, x7, x14 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, x16 __LF \ + mov x16, #977 __LF \ + mul x10, pconst, x6 __LF \ + umulh x13, pconst, x6 __LF \ + and x6, x7, #0xffffffff __LF \ + lsr x7, x7, #32 __LF \ + mul x11, x16, x6 __LF \ + madd x6, x16, x7, x6 __LF \ + adds x11, x11, x6, lsl #32 __LF \ + lsr x6, x6, #32 __LF \ + adc x14, x7, x6 __LF \ + mul x12, pconst, x8 __LF \ + umulh x8, pconst, x8 __LF \ + and x6, x9, #0xffffffff __LF \ + lsr x7, x9, #32 __LF \ + mul x9, x16, x6 __LF \ + madd x6, x16, x7, x6 __LF \ + adds x9, x9, x6, lsl #32 __LF \ + lsr x6, x6, #32 __LF \ + adc x15, x7, x6 __LF \ + adds x2, x2, x10 __LF \ + adcs x3, x3, x11 __LF \ + adcs x4, x4, x12 __LF \ + adcs x5, x5, x9 __LF \ + cset x6, hs __LF \ + adds x3, x3, x13 __LF \ + adcs x4, x4, x14 __LF \ + adcs x5, x5, x8 __LF \ + adc x6, x6, x15 __LF \ + stp x2, x3, [P0] __LF \ + stp x4, x5, [P0+16] __LF \ + str x6, [P0+32] + +// Weak doubling operation, staying in 4 digits but not in general +// fully normalizing modulo p_256k1 + +#define weakdouble_p256k1(P0,P1) \ + ldp x1, x2, [P1] __LF \ + lsl x0, x1, #1 __LF \ + ldp x3, x4, [P1+16] __LF \ + ands xzr, x4, #0x8000000000000000 __LF \ + csel x5, pconst, xzr, ne __LF \ + extr x1, x2, x1, #63 __LF \ + adds x0, x0, x5 __LF \ + extr x2, x3, x2, #63 __LF \ + adcs x1, x1, xzr __LF \ + extr x3, x4, x3, #63 __LF \ + adcs x2, x2, xzr __LF \ + stp x0, x1, [P0] __LF \ + adc x3, x3, xzr __LF \ + stp x2, x3, [P0+16] + +// P0 = C * P1 - D * P2 with 5-word inputs P1 and P2 +// Only used here with C = 12, D = 9, but could be used more generally. 
+// We start with (2^40 * 2^256 + C * P1) - (D * P2 + 2^40 * k) +// where p_256k1 = 2^256 - k (so k = 4294968273) + +#define cmsub_p256k1(P0,C,P1,D,P2) \ + mov x10, C __LF \ + ldp x4, x5, [P1] __LF \ + mul x0, x4, x10 __LF \ + mul x1, x5, x10 __LF \ + ldp x6, x7, [P1+16] __LF \ + mul x2, x6, x10 __LF \ + mul x3, x7, x10 __LF \ + ldr x13, [P1+32] __LF \ + umulh x4, x4, x10 __LF \ + adds x1, x1, x4 __LF \ + umulh x5, x5, x10 __LF \ + adcs x2, x2, x5 __LF \ + umulh x6, x6, x10 __LF \ + adcs x3, x3, x6 __LF \ + umulh x4, x7, x10 __LF \ + mul x13, x13, x10 __LF \ + adc x9, x4, x13 __LF \ + orr x9, x9, #0x10000000000 __LF \ + /* [x9; x3;x2;x1;x0] = 2^40 * 2^256 + C * P1 */ \ + mov x10, D __LF \ + ldp x13, x14, [P2] __LF \ + mul x5, x14, x10 __LF \ + umulh x6, x14, x10 __LF \ + adds x5, x5, pconst, lsr #24 __LF \ + adc x6, x6, xzr __LF \ + mul x4, x13, x10 __LF \ + adds x4, x4, pconst, lsl #40 __LF \ + umulh x13, x13, x10 __LF \ + adcs x5, x5, x13 __LF \ + ldp x13, x14, [P2+16] __LF \ + mul x12, x13, x10 __LF \ + umulh x7, x13, x10 __LF \ + ldr x13, [P2+32] __LF \ + adcs x6, x6, x12 __LF \ + mul x12, x14, x10 __LF \ + umulh x8, x14, x10 __LF \ + mul x13, x13, x10 __LF \ + adcs x7, x7, x12 __LF \ + adc x8, x8, x13 __LF \ + /* [x8; x7;x6;x5;x4] = D * P2 + 2^40 * k */ \ + subs x0, x0, x4 __LF \ + sbcs x1, x1, x5 __LF \ + sbcs x2, x2, x6 __LF \ + sbcs x3, x3, x7 __LF \ + sbc x4, x9, x8 __LF \ + /* [x4; x3;x2;x1;x0] = 2^40*p_256k1+result */ \ + add x10, x4, #1 __LF \ + /* (h + 1) is quotient estimate */ \ + mul x4, pconst, x10 __LF \ + umulh x5, pconst, x10 __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + adcs x2, x2, xzr __LF \ + adcs x3, x3, xzr __LF \ + csel x11, pconst, xzr, cc __LF \ + /* If un-correction needed */ \ + subs x0, x0, x11 __LF \ + sbcs x1, x1, xzr __LF \ + stp x0, x1, [P0] __LF \ + sbcs x2, x2, xzr __LF \ + sbc x3, x3, xzr __LF \ + stp x2, x3, [P0+16] + +// P0 = 3 * P1 - 8 * P2 with 5-digit P1 and P2 +// We start with (2^40 * 2^256 + 3 * P1) - (8 * P2 + 2^40 * k) +// where p_256k1 = 2^256 - k (so k = 4294968273) + +#define cmsub38_p256k1(P0,P1,P2) \ + mov x10, #3 __LF \ + ldp x4, x5, [P1] __LF \ + mul x0, x4, x10 __LF \ + mul x1, x5, x10 __LF \ + ldp x6, x7, [P1+16] __LF \ + mul x2, x6, x10 __LF \ + mul x3, x7, x10 __LF \ + ldr x13, [P1+32] __LF \ + umulh x4, x4, x10 __LF \ + adds x1, x1, x4 __LF \ + umulh x5, x5, x10 __LF \ + adcs x2, x2, x5 __LF \ + umulh x6, x6, x10 __LF \ + adcs x3, x3, x6 __LF \ + umulh x4, x7, x10 __LF \ + mul x13, x13, x10 __LF \ + adc x9, x4, x13 __LF \ + orr x9, x9, #0x10000000000 __LF \ + /* [x9; x3;x2;x1;x0] = 2^40 * 2^256 + 3 * P1 */ \ + lsl x12, pconst, #40 __LF \ + ldp x13, x14, [P2] __LF \ + lsl x4, x13, #3 __LF \ + adds x4, x4, x12 __LF \ + extr x5, x14, x13, #61 __LF \ + lsr x12, pconst, #24 __LF \ + adcs x5, x5, x12 __LF \ + ldp x11, x12, [P2+16] __LF \ + extr x6, x11, x14, #61 __LF \ + adcs x6, x6, xzr __LF \ + ldr x13, [P2+32] __LF \ + extr x7, x12, x11, #61 __LF \ + adcs x7, x7, xzr __LF \ + extr x8, x13, x12, #61 __LF \ + adc x8, x8, xzr __LF \ + /* [x8; x7;x6;x5;x4] = 8 * P2 + 2^40 * k */ \ + subs x0, x0, x4 __LF \ + sbcs x1, x1, x5 __LF \ + sbcs x2, x2, x6 __LF \ + sbcs x3, x3, x7 __LF \ + sbc x4, x9, x8 __LF \ + /* [x4; x3;x2;x1;x0] = 2^40*p_256k1+result */ \ + add x10, x4, #1 __LF \ + /* (h + 1) is quotient estimate */ \ + mul x4, pconst, x10 __LF \ + umulh x5, pconst, x10 __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + adcs x2, x2, xzr __LF \ + adcs x3, x3, xzr __LF \ + csel x11, pconst, xzr, cc __LF \ + /* If 
un-correction needed */ \ + subs x0, x0, x11 __LF \ + sbcs x1, x1, xzr __LF \ + stp x0, x1, [P0] __LF \ + sbcs x2, x2, xzr __LF \ + sbc x3, x3, xzr __LF \ + stp x2, x3, [P0+16] + +// P0 = 4 * P1 - P2 with 5-digit P1, 4-digit P2 and result. +// This is done by direct subtraction of P2 since the method +// in bignum_cmul_p256k1 etc. for quotient estimation still +// works when the value to be reduced is negative, as +// long as it is > -p_256k1, which is the case here. + +#define cmsub41_p256k1(P0,P1,P2) \ + ldp x1, x2, [P1] __LF \ + lsl x0, x1, #2 __LF \ + ldp x6, x7, [P2] __LF \ + subs x0, x0, x6 __LF \ + extr x1, x2, x1, #62 __LF \ + sbcs x1, x1, x7 __LF \ + ldp x3, x4, [P1+16] __LF \ + extr x2, x3, x2, #62 __LF \ + ldp x6, x7, [P2+16] __LF \ + sbcs x2, x2, x6 __LF \ + extr x3, x4, x3, #62 __LF \ + sbcs x3, x3, x7 __LF \ + ldr x5, [P1+32] __LF \ + extr x4, x5, x4, #62 __LF \ + sbc x4, x4, xzr __LF \ + add x5, x4, #1 __LF \ + /* (h + 1) is quotient estimate */ \ + mul x4, pconst, x5 __LF \ + adds x0, x0, x4 __LF \ + umulh x5, pconst, x5 __LF \ + adcs x1, x1, x5 __LF \ + adcs x2, x2, xzr __LF \ + adcs x3, x3, xzr __LF \ + csel x4, pconst, xzr, cc __LF \ + /* If un-correction needed */ \ + subs x0, x0, x4 __LF \ + sbcs x1, x1, xzr __LF \ + stp x0, x1, [P0] __LF \ + sbcs x2, x2, xzr __LF \ + sbc x3, x3, xzr __LF \ + stp x2, x3, [P0+16] + +S2N_BN_SYMBOL(secp256k1_jdouble): + +// Save registers and make room on stack for temporary variables + + sub sp, sp, NSPACE+16 + stp x19, x20, [sp, NSPACE] + +// Move the input arguments to stable place + + mov input_z, x0 + mov input_x, x1 + +// Set up pconst = 4294968273, so p_256k1 = 2^256 - pconst + + mov pconst, #977 + orr pconst, pconst, #0x100000000 + +// Main sequence of operations + + // y_2 = y^2 + + sqr_p256k1(y_2,y_1) + + // x_2 = x^2 + + sqr_p256k1(x_2,x_1) + + // tmp = 2 * y_1 (in 4 words but not fully normalized) + + weakdouble_p256k1(tmp,y_1) + + // xy2 = x * y^2 (5-digit partially reduced) + // x_4 = x^4 (5-digit partially reduced) + + roughmul_p256k1(xy2,x_1,y_2) + roughsqr_p256k1(x_4,x_2) + + // z_3 = 2 * y_1 * z_1 + + mul_p256k1(z_3,z_1,tmp) + + // d = 12 * xy2 - 9 * x_4 + + cmsub_p256k1(d,12,xy2,9,x_4) + + // y4 = y2^2 (5-digit partially reduced) + + roughsqr_p256k1(y_4,y_2) + + // dx2 = d * x_2 (5-digit partially reduced) + + roughmul_p256k1(dx2,x_2,d) + + // x_3 = 4 * xy2 - d + + cmsub41_p256k1(x_3,xy2,d) + + // y_3 = 3 * dx2 - 8 * y_4 + + cmsub38_p256k1(y_3,dx2,y_4) + +// Restore stack and return + + ldp x19, x20, [sp, NSPACE] + add sp, sp, NSPACE+16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jdouble_alt.S new file mode 100644 index 00000000000..4af92a167a7 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jdouble_alt.S @@ -0,0 +1,660 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on SECG curve secp256k1 in Jacobian coordinates +// +// extern void secp256k1_jdouble_alt +// (uint64_t p3[static 12],uint64_t p1[static 12]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). 
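The main sequence in secp256k1_jdouble above, repeated in this _alt variant, evaluates d = 12*xy2 - 9*x_4, x_3 = 4*xy2 - d, y_3 = 3*dx2 - 8*y_4 and z_3 = 2*y_1*z_1. A Python sketch cross-checking those formulas against affine doubling on y^2 = x^3 + 7 (illustration only; it uses arbitrary-precision integers rather than 4-word values):

```python
# Cross-check of the Jacobian doubling formulas used in the main sequence.
p = 2**256 - 4294968273             # p_256k1

def jdouble(X, Y, Z):
    y2  = Y * Y % p                 # y_2 = y_1^2
    x2  = X * X % p                 # x_2 = x_1^2
    xy2 = X * y2 % p                # xy2 = x_1 * y_1^2
    x4  = x2 * x2 % p               # x_4 = x_1^4
    z3  = 2 * Y * Z % p             # z_3 = 2 * y_1 * z_1
    d   = (12 * xy2 - 9 * x4) % p
    y4  = y2 * y2 % p
    dx2 = d * x2 % p
    x3  = (4 * xy2 - d) % p
    y3  = (3 * dx2 - 8 * y4) % p
    return x3, y3, z3

# Pick any affine point on y^2 = x^3 + 7 (p % 4 == 3, so sqrt is a power).
x = 1
while True:
    x += 1
    rhs = (x * x * x + 7) % p
    y = pow(rhs, (p + 1) // 4, p)
    if y * y % p == rhs:
        break

# Affine doubling via the tangent slope, for comparison.
lam = 3 * x * x * pow(2 * y, -1, p) % p
ax = (lam * lam - 2 * x) % p
ay = (lam * (x - ax) - y) % p

X3, Y3, Z3 = jdouble(x, y, 1)
zi = pow(Z3, -1, p)
assert (X3 * zi * zi) % p == ax and (Y3 * pow(zi, 3, p)) % p == ay
```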
+// It is assumed that all coordinates of the input point are fully +// reduced mod p_256k1 and that the z coordinate is not zero. +// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(secp256k1_jdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(secp256k1_jdouble_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x15 +#define input_x x16 + +// The magic constant 2^256 - p_256k1 + +#define pconst x17 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries + +#define x_2 sp, #(NUMSIZE*0) +#define y_2 sp, #(NUMSIZE*1) +#define d sp, #(NUMSIZE*2) +#define tmp sp, #(NUMSIZE*3) +#define x_4 sp, #(NUMSIZE*4) +#define y_4 sp, #(NUMSIZE*6) +#define dx2 sp, #(NUMSIZE*8) +#define xy2 sp, #(NUMSIZE*10) + +#define NSPACE #(NUMSIZE*12) + +// Corresponds exactly to bignum_mul_p256k1_alt except for +// re-use of the pconst register for the constant 4294968273 + +#define mul_p256k1(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x0, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x1, x3, x10 __LF \ + adcs x0, x0, x11 __LF \ + adc x1, x1, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x1, x1, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x0, x0, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x1, x1, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x0, x0, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x1, x1, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x0, x0, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x1, x1, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mul x11, pconst, x1 __LF \ + umulh x9, pconst, x1 __LF \ + adds x12, x12, x11 __LF \ + mul x11, pconst, x3 __LF \ + umulh x3, pconst, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, pconst, x4 __LF \ + umulh x4, pconst, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, pconst, x5 __LF \ + umulh x5, pconst, x5 __LF \ + adcs x0, x0, x11 __LF \ + cset x1, cs __LF \ + adds x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs 
x0, x0, x4 __LF \ + adc x1, x1, x5 __LF \ + add x8, x1, #0x1 __LF \ + mul x11, pconst, x8 __LF \ + umulh x9, pconst, x8 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, xzr __LF \ + adcs x0, x0, xzr __LF \ + csel x7, pconst, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x0, x0, xzr __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x0, [P0+16] + +// Corresponds exactly to bignum_sqr_p256k1_alt except for +// re-use of the pconst register for the constant 4294968273 + +#define sqr_p256k1(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x7, x2, x4 __LF \ + umulh x6, x2, x4 __LF \ + adds x10, x10, x7 __LF \ + adcs x11, x11, x6 __LF \ + mul x7, x3, x4 __LF \ + umulh x6, x3, x4 __LF \ + adc x6, x6, xzr __LF \ + adds x11, x11, x7 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x6 __LF \ + mul x7, x3, x5 __LF \ + umulh x6, x3, x5 __LF \ + adc x6, x6, xzr __LF \ + adds x12, x12, x7 __LF \ + adcs x13, x13, x6 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x6, cs __LF \ + umulh x7, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x7 __LF \ + mul x7, x3, x3 __LF \ + adcs x10, x10, x7 __LF \ + umulh x7, x3, x3 __LF \ + adcs x11, x11, x7 __LF \ + mul x7, x4, x4 __LF \ + adcs x12, x12, x7 __LF \ + umulh x7, x4, x4 __LF \ + adcs x13, x13, x7 __LF \ + mul x7, x5, x5 __LF \ + adcs x14, x14, x7 __LF \ + umulh x7, x5, x5 __LF \ + adc x6, x6, x7 __LF \ + mul x7, pconst, x12 __LF \ + umulh x4, pconst, x12 __LF \ + adds x8, x8, x7 __LF \ + mul x7, pconst, x13 __LF \ + umulh x13, pconst, x13 __LF \ + adcs x9, x9, x7 __LF \ + mul x7, pconst, x14 __LF \ + umulh x14, pconst, x14 __LF \ + adcs x10, x10, x7 __LF \ + mul x7, pconst, x6 __LF \ + umulh x6, pconst, x6 __LF \ + adcs x11, x11, x7 __LF \ + cset x12, cs __LF \ + adds x9, x9, x4 __LF \ + adcs x10, x10, x13 __LF \ + adcs x11, x11, x14 __LF \ + adc x12, x12, x6 __LF \ + add x2, x12, #0x1 __LF \ + mul x7, pconst, x2 __LF \ + umulh x6, pconst, x2 __LF \ + adds x8, x8, x7 __LF \ + adcs x9, x9, x6 __LF \ + adcs x10, x10, xzr __LF \ + adcs x11, x11, xzr __LF \ + csel x3, pconst, xzr, cc __LF \ + subs x8, x8, x3 __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbc x11, x11, xzr __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Rough versions producing 5-word results + +#define roughmul_p256k1(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x0, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x1, x3, x10 __LF \ + adcs x0, x0, x11 __LF \ + adc x1, x1, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x1, x1, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x0, x0, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x1, x1, x11 __LF \ + adc x3, x3, 
xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x0, x0, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x1, x1, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x0, x0, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x1, x1, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mul x11, pconst, x1 __LF \ + umulh x9, pconst, x1 __LF \ + adds x12, x12, x11 __LF \ + mul x11, pconst, x3 __LF \ + umulh x3, pconst, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, pconst, x4 __LF \ + umulh x4, pconst, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, pconst, x5 __LF \ + umulh x5, pconst, x5 __LF \ + adcs x0, x0, x11 __LF \ + cset x1, cs __LF \ + adds x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x0, x0, x4 __LF \ + adc x1, x1, x5 __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x0, [P0+16] __LF \ + str x1, [P0+32] + +#define roughsqr_p256k1(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x7, x2, x4 __LF \ + umulh x6, x2, x4 __LF \ + adds x10, x10, x7 __LF \ + adcs x11, x11, x6 __LF \ + mul x7, x3, x4 __LF \ + umulh x6, x3, x4 __LF \ + adc x6, x6, xzr __LF \ + adds x11, x11, x7 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x6 __LF \ + mul x7, x3, x5 __LF \ + umulh x6, x3, x5 __LF \ + adc x6, x6, xzr __LF \ + adds x12, x12, x7 __LF \ + adcs x13, x13, x6 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x6, cs __LF \ + umulh x7, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x7 __LF \ + mul x7, x3, x3 __LF \ + adcs x10, x10, x7 __LF \ + umulh x7, x3, x3 __LF \ + adcs x11, x11, x7 __LF \ + mul x7, x4, x4 __LF \ + adcs x12, x12, x7 __LF \ + umulh x7, x4, x4 __LF \ + adcs x13, x13, x7 __LF \ + mul x7, x5, x5 __LF \ + adcs x14, x14, x7 __LF \ + umulh x7, x5, x5 __LF \ + adc x6, x6, x7 __LF \ + mul x7, pconst, x12 __LF \ + umulh x4, pconst, x12 __LF \ + adds x8, x8, x7 __LF \ + mul x7, pconst, x13 __LF \ + umulh x13, pconst, x13 __LF \ + adcs x9, x9, x7 __LF \ + mul x7, pconst, x14 __LF \ + umulh x14, pconst, x14 __LF \ + adcs x10, x10, x7 __LF \ + mul x7, pconst, x6 __LF \ + umulh x6, pconst, x6 __LF \ + adcs x11, x11, x7 __LF \ + cset x12, cs __LF \ + adds x9, x9, x4 __LF \ + adcs x10, x10, x13 __LF \ + adcs x11, x11, x14 __LF \ + adc x12, x12, x6 __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] __LF \ + str x12, [P0+32] + +// Weak doubling operation, staying in 4 digits but not in general +// fully normalizing modulo p_256k1 + +#define weakdouble_p256k1(P0,P1) \ + ldp x1, x2, [P1] __LF \ + lsl x0, x1, #1 __LF \ + ldp x3, x4, [P1+16] __LF \ + ands xzr, x4, #0x8000000000000000 __LF \ + csel x5, pconst, xzr, ne __LF \ + extr x1, x2, x1, #63 __LF \ + adds x0, x0, x5 __LF \ + 
extr x2, x3, x2, #63 __LF \ + adcs x1, x1, xzr __LF \ + extr x3, x4, x3, #63 __LF \ + adcs x2, x2, xzr __LF \ + stp x0, x1, [P0] __LF \ + adc x3, x3, xzr __LF \ + stp x2, x3, [P0+16] + +// P0 = C * P1 - D * P2 with 5-word inputs P1 and P2 +// Only used here with C = 12, D = 9, but could be used more generally. +// We start with (2^40 * 2^256 + C * P1) - (D * P2 + 2^40 * k) +// where p_256k1 = 2^256 - k (so k = 4294968273) + +#define cmsub_p256k1(P0,C,P1,D,P2) \ + mov x10, C __LF \ + ldp x4, x5, [P1] __LF \ + mul x0, x4, x10 __LF \ + mul x1, x5, x10 __LF \ + ldp x6, x7, [P1+16] __LF \ + mul x2, x6, x10 __LF \ + mul x3, x7, x10 __LF \ + ldr x13, [P1+32] __LF \ + umulh x4, x4, x10 __LF \ + adds x1, x1, x4 __LF \ + umulh x5, x5, x10 __LF \ + adcs x2, x2, x5 __LF \ + umulh x6, x6, x10 __LF \ + adcs x3, x3, x6 __LF \ + umulh x4, x7, x10 __LF \ + mul x13, x13, x10 __LF \ + adc x9, x4, x13 __LF \ + orr x9, x9, #0x10000000000 __LF \ + /* [x9; x3;x2;x1;x0] = 2^40 * 2^256 + C * P1 */ \ + mov x10, D __LF \ + ldp x13, x14, [P2] __LF \ + mul x5, x14, x10 __LF \ + umulh x6, x14, x10 __LF \ + adds x5, x5, pconst, lsr #24 __LF \ + adc x6, x6, xzr __LF \ + mul x4, x13, x10 __LF \ + adds x4, x4, pconst, lsl #40 __LF \ + umulh x13, x13, x10 __LF \ + adcs x5, x5, x13 __LF \ + ldp x13, x14, [P2+16] __LF \ + mul x12, x13, x10 __LF \ + umulh x7, x13, x10 __LF \ + ldr x13, [P2+32] __LF \ + adcs x6, x6, x12 __LF \ + mul x12, x14, x10 __LF \ + umulh x8, x14, x10 __LF \ + mul x13, x13, x10 __LF \ + adcs x7, x7, x12 __LF \ + adc x8, x8, x13 __LF \ + /* [x8; x7;x6;x5;x4] = D * P2 + 2^40 * k */ \ + subs x0, x0, x4 __LF \ + sbcs x1, x1, x5 __LF \ + sbcs x2, x2, x6 __LF \ + sbcs x3, x3, x7 __LF \ + sbc x4, x9, x8 __LF \ + /* [x4; x3;x2;x1;x0] = 2^40*p_256k1+result */ \ + add x10, x4, #1 __LF \ + /* (h + 1) is quotient estimate */ \ + mul x4, pconst, x10 __LF \ + umulh x5, pconst, x10 __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + adcs x2, x2, xzr __LF \ + adcs x3, x3, xzr __LF \ + csel x11, pconst, xzr, cc __LF \ + /* If un-correction needed */ \ + subs x0, x0, x11 __LF \ + sbcs x1, x1, xzr __LF \ + stp x0, x1, [P0] __LF \ + sbcs x2, x2, xzr __LF \ + sbc x3, x3, xzr __LF \ + stp x2, x3, [P0+16] + +// P0 = 3 * P1 - 8 * P2 with 5-digit P1 and P2 +// We start with (2^40 * 2^256 + 3 * P1) - (8 * P2 + 2^40 * k) +// where p_256k1 = 2^256 - k (so k = 4294968273) + +#define cmsub38_p256k1(P0,P1,P2) \ + mov x10, #3 __LF \ + ldp x4, x5, [P1] __LF \ + mul x0, x4, x10 __LF \ + mul x1, x5, x10 __LF \ + ldp x6, x7, [P1+16] __LF \ + mul x2, x6, x10 __LF \ + mul x3, x7, x10 __LF \ + ldr x13, [P1+32] __LF \ + umulh x4, x4, x10 __LF \ + adds x1, x1, x4 __LF \ + umulh x5, x5, x10 __LF \ + adcs x2, x2, x5 __LF \ + umulh x6, x6, x10 __LF \ + adcs x3, x3, x6 __LF \ + umulh x4, x7, x10 __LF \ + mul x13, x13, x10 __LF \ + adc x9, x4, x13 __LF \ + orr x9, x9, #0x10000000000 __LF \ + /* [x9; x3;x2;x1;x0] = 2^40 * 2^256 + 3 * P1 */ \ + lsl x12, pconst, #40 __LF \ + ldp x13, x14, [P2] __LF \ + lsl x4, x13, #3 __LF \ + adds x4, x4, x12 __LF \ + extr x5, x14, x13, #61 __LF \ + lsr x12, pconst, #24 __LF \ + adcs x5, x5, x12 __LF \ + ldp x11, x12, [P2+16] __LF \ + extr x6, x11, x14, #61 __LF \ + adcs x6, x6, xzr __LF \ + ldr x13, [P2+32] __LF \ + extr x7, x12, x11, #61 __LF \ + adcs x7, x7, xzr __LF \ + extr x8, x13, x12, #61 __LF \ + adc x8, x8, xzr __LF \ + /* [x8; x7;x6;x5;x4] = 8 * P2 + 2^40 * k */ \ + subs x0, x0, x4 __LF \ + sbcs x1, x1, x5 __LF \ + sbcs x2, x2, x6 __LF \ + sbcs x3, x3, x7 __LF \ + sbc x4, x9, x8 __LF \ + /* [x4; 
x3;x2;x1;x0] = 2^40*p_256k1+result */ \ + add x10, x4, #1 __LF \ + /* (h + 1) is quotient estimate */ \ + mul x4, pconst, x10 __LF \ + umulh x5, pconst, x10 __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + adcs x2, x2, xzr __LF \ + adcs x3, x3, xzr __LF \ + csel x11, pconst, xzr, cc __LF \ + /* If un-correction needed */ \ + subs x0, x0, x11 __LF \ + sbcs x1, x1, xzr __LF \ + stp x0, x1, [P0] __LF \ + sbcs x2, x2, xzr __LF \ + sbc x3, x3, xzr __LF \ + stp x2, x3, [P0+16] + +// P0 = 4 * P1 - P2 with 5-digit P1, 4-digit P2 and result. +// This is done by direct subtraction of P2 since the method +// in bignum_cmul_p256k1 etc. for quotient estimation still +// works when the value to be reduced is negative, as +// long as it is > -p_256k1, which is the case here. + +#define cmsub41_p256k1(P0,P1,P2) \ + ldp x1, x2, [P1] __LF \ + lsl x0, x1, #2 __LF \ + ldp x6, x7, [P2] __LF \ + subs x0, x0, x6 __LF \ + extr x1, x2, x1, #62 __LF \ + sbcs x1, x1, x7 __LF \ + ldp x3, x4, [P1+16] __LF \ + extr x2, x3, x2, #62 __LF \ + ldp x6, x7, [P2+16] __LF \ + sbcs x2, x2, x6 __LF \ + extr x3, x4, x3, #62 __LF \ + sbcs x3, x3, x7 __LF \ + ldr x5, [P1+32] __LF \ + extr x4, x5, x4, #62 __LF \ + sbc x4, x4, xzr __LF \ + add x5, x4, #1 __LF \ + /* (h + 1) is quotient estimate */ \ + mul x4, pconst, x5 __LF \ + adds x0, x0, x4 __LF \ + umulh x5, pconst, x5 __LF \ + adcs x1, x1, x5 __LF \ + adcs x2, x2, xzr __LF \ + adcs x3, x3, xzr __LF \ + csel x4, pconst, xzr, cc __LF \ + /* If un-correction needed */ \ + subs x0, x0, x4 __LF \ + sbcs x1, x1, xzr __LF \ + stp x0, x1, [P0] __LF \ + sbcs x2, x2, xzr __LF \ + sbc x3, x3, xzr __LF \ + stp x2, x3, [P0+16] + +S2N_BN_SYMBOL(secp256k1_jdouble_alt): + +// Make room on stack for temp registers + + sub sp, sp, NSPACE + +// Move the input arguments to stable place + + mov input_z, x0 + mov input_x, x1 + +// Set up pconst = 4294968273, so p_256k1 = 2^256 - pconst + + mov pconst, #977 + orr pconst, pconst, #0x100000000 + +// Main sequence of operations + + // y_2 = y^2 + + sqr_p256k1(y_2,y_1) + + // x_2 = x^2 + + sqr_p256k1(x_2,x_1) + + // tmp = 2 * y_1 (in 4 words but not fully normalized) + + weakdouble_p256k1(tmp,y_1) + + // xy2 = x * y^2 (5-digit partially reduced) + // x_4 = x^4 (5-digit partially reduced) + + roughmul_p256k1(xy2,x_1,y_2) + roughsqr_p256k1(x_4,x_2) + + // z_3 = 2 * y_1 * z_1 + + mul_p256k1(z_3,z_1,tmp) + + // d = 12 * xy2 - 9 * x_4 + + cmsub_p256k1(d,12,xy2,9,x_4) + + // y4 = y2^2 (5-digit partially reduced) + + roughsqr_p256k1(y_4,y_2) + + // dx2 = d * x_2 (5-digit partially reduced) + + roughmul_p256k1(dx2,x_2,d) + + // x_3 = 4 * xy2 - d + + cmsub41_p256k1(x_3,xy2,d) + + // y_3 = 3 * dx2 - 8 * y_4 + + cmsub38_p256k1(y_3,dx2,y_4) + +// Restore stack and return + + add sp, sp, NSPACE + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jmixadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jmixadd.S new file mode 100644 index 00000000000..660a7ebb18b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jmixadd.S @@ -0,0 +1,507 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
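All of the p_256k1 macros in these files reduce a top word h against p_256k1 = 2^256 - k the same way: take q = h + 1 as the quotient estimate, add q*k, and subtract k again ("un-correction") when no carry out of 2^256 appears. A Python sketch of that arithmetic identity in isolation (illustration only; carry-flag plumbing and the 2^40 offset used by the cmsub macros are omitted):

```python
# Quotient-estimate reduction of v = 2^256*h + l modulo p_256k1 = 2^256 - k.
import random

k = 4294968273
p = 2**256 - k

def reduce_top_word(h, l):
    q = h + 1                       # (h + 1) is the quotient estimate
    t = l + q * k                   # v - q*p == t - 2^256
    if t >= 2**256:                 # carry out: the estimate was exact
        return t - 2**256
    return t - k                    # no carry: un-correct, i.e. use q = h

for _ in range(1000):
    h = random.getrandbits(64)
    l = random.getrandbits(256)
    assert reduce_top_word(h, l) == ((h << 256) + l) % p
```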
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on SECG curve secp256k1 in Jacobian coordinates +// +// extern void secp256k1_jmixadd +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. It is assumed that +// all the coordinates of the input points p1 and p2 are fully reduced +// mod p_256k1, that the z coordinate of p1 is nonzero and that neither +// p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents the same affine +// point as". +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(secp256k1_jmixadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(secp256k1_jmixadd) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x19 +#define input_x x20 +#define input_y x21 + +// The magic constant 2^256 - p_256k1 + +#define pconst x17 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define NSPACE (NUMSIZE*6) + +// Corresponds exactly to bignum_mul_p256k1 except for registers and +// re-use of the pconst register for the constant 4294968273 + +#define mul_p256k1(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + mul x7, x3, x5 __LF \ + umulh x8, x3, x5 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, lo __LF \ + csetm x16, lo __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, lo __LF \ + cinv x16, x16, lo __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + mul x11, x3, x5 __LF \ + umulh x12, x3, x5 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, lo __LF \ + csetm x16, lo __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, lo __LF \ + cinv x16, x16, lo __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 
__LF \ + adc x14, x14, xzr __LF \ + cmn x16, #1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, lo __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, lo __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, lo __LF \ + csetm x9, lo __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, lo __LF \ + cinv x9, x9, lo __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x16, #977 __LF \ + mul x3, pconst, x11 __LF \ + umulh x5, pconst, x11 __LF \ + and x15, x12, #0xffffffff __LF \ + lsr x2, x12, #32 __LF \ + mul x4, x16, x15 __LF \ + madd x15, x16, x2, x15 __LF \ + adds x4, x4, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x6, x2, x15 __LF \ + mul x11, pconst, x13 __LF \ + umulh x13, pconst, x13 __LF \ + and x15, x14, #0xffffffff __LF \ + lsr x2, x14, #32 __LF \ + mul x12, x16, x15 __LF \ + madd x15, x16, x2, x15 __LF \ + adds x12, x12, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x14, x2, x15 __LF \ + adds x7, x7, x3 __LF \ + adcs x8, x8, x4 __LF \ + adcs x9, x9, x11 __LF \ + adcs x10, x10, x12 __LF \ + cset x11, hs __LF \ + adds x8, x8, x5 __LF \ + adcs x9, x9, x6 __LF \ + adcs x10, x10, x13 __LF \ + adc x11, x11, x14 __LF \ + add x0, x11, #1 __LF \ + mul x3, x16, x0 __LF \ + lsr x4, x0, #32 __LF \ + adds x3, x3, x0, lsl #32 __LF \ + adc x4, xzr, x4 __LF \ + adds x7, x7, x3 __LF \ + adcs x8, x8, x4 __LF \ + adcs x9, x9, xzr __LF \ + adcs x10, x10, xzr __LF \ + csel x1, pconst, xzr, lo __LF \ + subs x7, x7, x1 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] + +// Corresponds exactly to bignum_sqr_p256k1 except for +// re-use of the pconst register for the constant 4294968273 + +#define sqr_p256k1(P0,P1) \ + ldp x10, x11, [P1] __LF \ + ldp x12, x13, [P1+16] __LF \ + umull x2, w10, w10 __LF \ + lsr x14, x10, #32 __LF \ + umull x3, w14, w14 __LF \ + umull x14, w10, w14 __LF \ + adds x2, x2, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x3, x3, x14 __LF \ + umull x4, w11, w11 __LF \ + lsr x14, x11, #32 __LF \ + umull x5, w14, w14 __LF \ + umull x14, w11, w14 __LF \ + mul x15, x10, x11 __LF \ + umulh x16, 
x10, x11 __LF \ + adds x4, x4, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x5, x5, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x5, x5, xzr __LF \ + adds x3, x3, x15 __LF \ + adcs x4, x4, x16 __LF \ + adc x5, x5, xzr __LF \ + umull x6, w12, w12 __LF \ + lsr x14, x12, #32 __LF \ + umull x7, w14, w14 __LF \ + umull x14, w12, w14 __LF \ + adds x6, x6, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x7, x7, x14 __LF \ + umull x8, w13, w13 __LF \ + lsr x14, x13, #32 __LF \ + umull x9, w14, w14 __LF \ + umull x14, w13, w14 __LF \ + mul x15, x12, x13 __LF \ + umulh x16, x12, x13 __LF \ + adds x8, x8, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x9, x9, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x9, x9, xzr __LF \ + adds x7, x7, x15 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, xzr __LF \ + subs x10, x10, x12 __LF \ + sbcs x11, x11, x13 __LF \ + csetm x16, lo __LF \ + eor x10, x10, x16 __LF \ + subs x10, x10, x16 __LF \ + eor x11, x11, x16 __LF \ + sbc x11, x11, x16 __LF \ + adds x6, x6, x4 __LF \ + adcs x7, x7, x5 __LF \ + adcs x8, x8, xzr __LF \ + adc x9, x9, xzr __LF \ + umull x12, w10, w10 __LF \ + lsr x5, x10, #32 __LF \ + umull x13, w5, w5 __LF \ + umull x5, w10, w5 __LF \ + adds x12, x12, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x13, x13, x5 __LF \ + umull x15, w11, w11 __LF \ + lsr x5, x11, #32 __LF \ + umull x14, w5, w5 __LF \ + umull x5, w11, w5 __LF \ + mul x4, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x15, x15, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x14, x14, x5 __LF \ + adds x4, x4, x4 __LF \ + adcs x16, x16, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x13, x13, x4 __LF \ + adcs x15, x15, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x4, x2, x6 __LF \ + adcs x5, x3, x7 __LF \ + adcs x6, x6, x8 __LF \ + adcs x7, x7, x9 __LF \ + csetm x16, lo __LF \ + subs x4, x4, x12 __LF \ + sbcs x5, x5, x13 __LF \ + sbcs x6, x6, x15 __LF \ + sbcs x7, x7, x14 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, x16 __LF \ + mov x16, #977 __LF \ + mul x10, pconst, x6 __LF \ + umulh x13, pconst, x6 __LF \ + and x6, x7, #0xffffffff __LF \ + lsr x7, x7, #32 __LF \ + mul x11, x16, x6 __LF \ + madd x6, x16, x7, x6 __LF \ + adds x11, x11, x6, lsl #32 __LF \ + lsr x6, x6, #32 __LF \ + adc x14, x7, x6 __LF \ + mul x12, pconst, x8 __LF \ + umulh x8, pconst, x8 __LF \ + and x6, x9, #0xffffffff __LF \ + lsr x7, x9, #32 __LF \ + mul x9, x16, x6 __LF \ + madd x6, x16, x7, x6 __LF \ + adds x9, x9, x6, lsl #32 __LF \ + lsr x6, x6, #32 __LF \ + adc x15, x7, x6 __LF \ + adds x2, x2, x10 __LF \ + adcs x3, x3, x11 __LF \ + adcs x4, x4, x12 __LF \ + adcs x5, x5, x9 __LF \ + cset x6, hs __LF \ + adds x3, x3, x13 __LF \ + adcs x4, x4, x14 __LF \ + adcs x5, x5, x8 __LF \ + adc x6, x6, x15 __LF \ + add x6, x6, #1 __LF \ + mul x10, x16, x6 __LF \ + lsr x11, x6, #32 __LF \ + adds x10, x10, x6, lsl #32 __LF \ + adc x11, xzr, x11 __LF \ + adds x2, x2, x10 __LF \ + adcs x3, x3, x11 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + csel x16, pconst, xzr, lo __LF \ + subs x2, x2, x16 __LF \ + sbcs x3, x3, xzr __LF \ + sbcs x4, x4, xzr __LF \ + sbc x5, x5, xzr __LF \ + stp x2, x3, [P0] __LF \ + stp x4, x5, [P0+16] + +// Corresponds exactly to bignum_sub_p256k1 + +#define sub_p256k1(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #0x3d1 
__LF \ + orr x3, x4, #0x100000000 __LF \ + csel x3, x3, xzr, cc __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +S2N_BN_SYMBOL(secp256k1_jmixadd): + +// Save registers and make room on stack for temporary variables + + sub sp, sp, NSPACE+32 + stp x19, x20, [sp, NSPACE] + stp x21, x22, [sp, NSPACE+16] + +// Move the input arguments to stable place + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Set up pconst = 4294968273, so p_256k1 = 2^256 - pconst + + mov pconst, #977 + orr pconst, pconst, #0x100000000 + +// Main code, just a sequence of basic field operations + + sqr_p256k1(zp2,z_1) + mul_p256k1(y2a,z_1,y_2) + + mul_p256k1(x2a,zp2,x_2) + mul_p256k1(y2a,zp2,y2a) + + sub_p256k1(xd,x2a,x_1) + sub_p256k1(yd,y2a,y_1) + + sqr_p256k1(zz,xd) + sqr_p256k1(ww,yd) + + mul_p256k1(zzx1,zz,x_1) + mul_p256k1(zzx2,zz,x2a) + + sub_p256k1(resx,ww,zzx1) + sub_p256k1(t1,zzx2,zzx1) + + mul_p256k1(resz,xd,z_1) + + sub_p256k1(resx,resx,zzx2) + + sub_p256k1(t2,zzx1,resx) + + mul_p256k1(t1,t1,y_1) + mul_p256k1(t2,yd,t2) + + sub_p256k1(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + orr x4, x0, x1 + orr x5, x2, x3 + orr x4, x4, x5 + cmp x4, xzr + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with an extra z = 1 +// coordinate, hence giving 0 + p2 = p2 for the final result. + + ldp x0, x1, [resx] + ldp x12, x13, [x_2] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x2, x3, [resx+16] + ldp x12, x13, [x_2+16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + + ldp x4, x5, [resy] + ldp x12, x13, [y_2] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x6, x7, [resy+16] + ldp x12, x13, [y_2+16] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + + ldp x8, x9, [resz] + mov x12, #1 + csel x8, x8, x12, ne + csel x9, x9, xzr, ne + ldp x10, x11, [resz+16] + csel x10, x10, xzr, ne + csel x11, x11, xzr, ne + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [y_3] + stp x6, x7, [y_3+16] + stp x8, x9, [z_3] + stp x10, x11, [z_3+16] + +// Restore stack and return + + ldp x19, x20, [sp, NSPACE] + ldp x21, x22, [sp, NSPACE+16] + add sp, sp, NSPACE+32 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jmixadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jmixadd_alt.S new file mode 100644 index 00000000000..d0135945453 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jmixadd_alt.S @@ -0,0 +1,379 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on SECG curve secp256k1 in Jacobian coordinates +// +// extern void secp256k1_jmixadd_alt +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. 
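Both secp256k1_jmixadd above and this _alt variant run the same sequence of field operations (zp2, y2a, x2a, xd, yd, zz, ww, zzx1, zzx2, resx, resy, resz). A Python sketch checking that sequence against plain affine addition (illustration only; it does not model the z_1 = 0 multiplexing at the end):

```python
# Cross-check of the mixed-addition sequence against affine chord addition.
p = 2**256 - 4294968273

def jmixadd(X1, Y1, Z1, x2, y2):
    zp2  = Z1 * Z1 % p              # z_1^2
    y2a  = Z1 * y2 % p              # z_1 * y_2
    x2a  = zp2 * x2 % p             # x_2 * z_1^2
    y2a  = zp2 * y2a % p            # y_2 * z_1^3
    xd   = (x2a - X1) % p
    yd   = (y2a - Y1) % p
    zz   = xd * xd % p
    ww   = yd * yd % p
    zzx1 = zz * X1 % p
    zzx2 = zz * x2a % p
    resx = (ww - zzx1 - zzx2) % p
    t1   = (zzx2 - zzx1) * Y1 % p
    t2   = yd * (zzx1 - resx) % p
    resz = xd * Z1 % p
    return resx, (t2 - t1) % p, resz

# Two distinct affine points, found by brute force as in the doubling check.
pts, x = [], 1
while len(pts) < 2:
    x += 1
    rhs = (x * x * x + 7) % p
    y = pow(rhs, (p + 1) // 4, p)
    if y * y % p == rhs:
        pts.append((x, y))
(x1, y1), (x2, y2) = pts

# Affine addition for comparison.
lam = (y2 - y1) * pow(x2 - x1, -1, p) % p
ax = (lam * lam - x1 - x2) % p
ay = (lam * (x1 - ax) - y1) % p

z = 3                               # arbitrary nonzero z so p1 is genuinely Jacobian
X3, Y3, Z3 = jmixadd(x1 * z * z % p, y1 * pow(z, 3, p) % p, z, x2, y2)
zi = pow(Z3, -1, p)
assert (X3 * zi * zi) % p == ax and (Y3 * pow(zi, 3, p)) % p == ay
```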
It is assumed that +// all the coordinates of the input points p1 and p2 are fully reduced +// mod p_256k1, that the z coordinate of p1 is nonzero and that neither +// p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents the same affine +// point as". +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(secp256k1_jmixadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(secp256k1_jmixadd_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x15 +#define input_x x16 +#define input_y x17 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define NSPACE (NUMSIZE*6) + +// Corresponds exactly to bignum_mul_p256k1_alt + +#define mul_p256k1(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x0, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x1, x3, x10 __LF \ + adcs x0, x0, x11 __LF \ + adc x1, x1, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x1, x1, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x0, x0, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x1, x1, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x0, x0, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x1, x1, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x0, x0, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x1, x1, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x3d1 __LF \ + orr x7, x7, #0x100000000 __LF \ + mul x11, x7, x1 __LF \ + 
umulh x9, x7, x1 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x0, x0, x11 __LF \ + cset x1, cs __LF \ + adds x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x0, x0, x4 __LF \ + adc x1, x1, x5 __LF \ + add x8, x1, #0x1 __LF \ + mul x11, x7, x8 __LF \ + umulh x9, x7, x8 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, xzr __LF \ + adcs x0, x0, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x0, x0, xzr __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x0, [P0+16] + +// Corresponds exactly to bignum_sqr_p256k1_alt + +#define sqr_p256k1(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x7, x2, x4 __LF \ + umulh x6, x2, x4 __LF \ + adds x10, x10, x7 __LF \ + adcs x11, x11, x6 __LF \ + mul x7, x3, x4 __LF \ + umulh x6, x3, x4 __LF \ + adc x6, x6, xzr __LF \ + adds x11, x11, x7 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x6 __LF \ + mul x7, x3, x5 __LF \ + umulh x6, x3, x5 __LF \ + adc x6, x6, xzr __LF \ + adds x12, x12, x7 __LF \ + adcs x13, x13, x6 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x6, cs __LF \ + umulh x7, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x7 __LF \ + mul x7, x3, x3 __LF \ + adcs x10, x10, x7 __LF \ + umulh x7, x3, x3 __LF \ + adcs x11, x11, x7 __LF \ + mul x7, x4, x4 __LF \ + adcs x12, x12, x7 __LF \ + umulh x7, x4, x4 __LF \ + adcs x13, x13, x7 __LF \ + mul x7, x5, x5 __LF \ + adcs x14, x14, x7 __LF \ + umulh x7, x5, x5 __LF \ + adc x6, x6, x7 __LF \ + mov x3, #0x3d1 __LF \ + orr x3, x3, #0x100000000 __LF \ + mul x7, x3, x12 __LF \ + umulh x4, x3, x12 __LF \ + adds x8, x8, x7 __LF \ + mul x7, x3, x13 __LF \ + umulh x13, x3, x13 __LF \ + adcs x9, x9, x7 __LF \ + mul x7, x3, x14 __LF \ + umulh x14, x3, x14 __LF \ + adcs x10, x10, x7 __LF \ + mul x7, x3, x6 __LF \ + umulh x6, x3, x6 __LF \ + adcs x11, x11, x7 __LF \ + cset x12, cs __LF \ + adds x9, x9, x4 __LF \ + adcs x10, x10, x13 __LF \ + adcs x11, x11, x14 __LF \ + adc x12, x12, x6 __LF \ + add x2, x12, #0x1 __LF \ + mul x7, x3, x2 __LF \ + umulh x6, x3, x2 __LF \ + adds x8, x8, x7 __LF \ + adcs x9, x9, x6 __LF \ + adcs x10, x10, xzr __LF \ + adcs x11, x11, xzr __LF \ + csel x3, x3, xzr, cc __LF \ + subs x8, x8, x3 __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbc x11, x11, xzr __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Corresponds exactly to bignum_sub_p256k1 + +#define sub_p256k1(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #0x3d1 __LF \ + orr x3, x4, #0x100000000 __LF \ + csel x3, x3, xzr, cc __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +S2N_BN_SYMBOL(secp256k1_jmixadd_alt): + +// Make room on stack for temporary variables +// Move the input arguments 
to stable places + + sub sp, sp, NSPACE + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations + + sqr_p256k1(zp2,z_1) + mul_p256k1(y2a,z_1,y_2) + + mul_p256k1(x2a,zp2,x_2) + mul_p256k1(y2a,zp2,y2a) + + sub_p256k1(xd,x2a,x_1) + sub_p256k1(yd,y2a,y_1) + + sqr_p256k1(zz,xd) + sqr_p256k1(ww,yd) + + mul_p256k1(zzx1,zz,x_1) + mul_p256k1(zzx2,zz,x2a) + + sub_p256k1(resx,ww,zzx1) + sub_p256k1(t1,zzx2,zzx1) + + mul_p256k1(resz,xd,z_1) + + sub_p256k1(resx,resx,zzx2) + + sub_p256k1(t2,zzx1,resx) + + mul_p256k1(t1,t1,y_1) + mul_p256k1(t2,yd,t2) + + sub_p256k1(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + orr x4, x0, x1 + orr x5, x2, x3 + orr x4, x4, x5 + cmp x4, xzr + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with an extra z = 1 +// coordinate, hence giving 0 + p2 = p2 for the final result. + + ldp x0, x1, [resx] + ldp x12, x13, [x_2] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x2, x3, [resx+16] + ldp x12, x13, [x_2+16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + + ldp x4, x5, [resy] + ldp x12, x13, [y_2] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x6, x7, [resy+16] + ldp x12, x13, [y_2+16] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + + ldp x8, x9, [resz] + mov x12, #1 + csel x8, x8, x12, ne + csel x9, x9, xzr, ne + ldp x10, x11, [resz+16] + csel x10, x10, xzr, ne + csel x11, x11, xzr, ne + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [y_3] + stp x6, x7, [y_3+16] + stp x8, x9, [z_3] + stp x10, x11, [z_3+16] + +// Restore stack and return + + add sp, sp, NSPACE + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/Makefile b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/Makefile new file mode 100644 index 00000000000..216db41a3f8 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/Makefile @@ -0,0 +1,58 @@ +############################################################################# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 +############################################################################# + +# If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise +# use a cross-assembling version so that the code can still be assembled +# and the proofs checked against the object files (though you won't be able +# to run code without additional emulation infrastructure). 
The aarch64 +# cross-assembling version can be installed manually by something like: +# +# sudo apt-get install binutils-aarch64-linux-gnu + +UNAME_RESULT=$(shell uname -p) + +ifeq ($(UNAME_RESULT),aarch64) +GAS=as +else +GAS=aarch64-linux-gnu-as +endif + +# List of object files + +OBJ = bignum_add_sm2.o \ + bignum_cmul_sm2.o \ + bignum_deamont_sm2.o \ + bignum_demont_sm2.o \ + bignum_double_sm2.o \ + bignum_half_sm2.o \ + bignum_inv_sm2.o \ + bignum_mod_nsm2.o \ + bignum_mod_nsm2_4.o \ + bignum_mod_sm2.o \ + bignum_mod_sm2_4.o \ + bignum_montinv_sm2.o \ + bignum_montmul_sm2.o \ + bignum_montmul_sm2_alt.o \ + bignum_montsqr_sm2.o \ + bignum_montsqr_sm2_alt.o \ + bignum_neg_sm2.o \ + bignum_optneg_sm2.o \ + bignum_sub_sm2.o \ + bignum_tomont_sm2.o \ + bignum_triple_sm2.o \ + sm2_montjadd.o \ + sm2_montjadd_alt.o \ + sm2_montjdouble.o \ + sm2_montjdouble_alt.o \ + sm2_montjmixadd.o \ + sm2_montjmixadd_alt.o \ + sm2_montjscalarmul.o \ + sm2_montjscalarmul_alt.o + +%.o : %.S ; $(CC) -E -I../../include $< | $(GAS) -o $@ - + +default: $(OBJ); + +clean:; rm -f *.o *.correct diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_add_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_add_sm2.S new file mode 100644 index 00000000000..84656cf9cfe --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_add_sm2.S @@ -0,0 +1,73 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Add modulo p_sm2, z := (x + y) mod p_sm2, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_add_sm2 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add_sm2) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 +#define c x3 +#define d0 x4 +#define d1 x5 +#define d2 x6 +#define d3 x7 +#define n0 x8 +#define n1 x9 +#define n2 x10 +#define n3 x11 + +S2N_BN_SYMBOL(bignum_add_sm2): + +// First just add the numbers as [c;d3;d2;d1;d0] + + ldp d0, d1, [x] + ldp n0, n1, [y] + adds d0, d0, n0 + adcs d1, d1, n1 + ldp d2, d3, [x, #16] + ldp n2, n3, [y, #16] + adcs d2, d2, n2 + adcs d3, d3, n3 + adc c, xzr, xzr + +// Now let [c;n3;n2;n1;n0] = [c;d3;d2;d1;d0] - p_sm2 + + subs n0, d0, #0xffffffffffffffff + mov n1, #0xffffffff00000000 + sbcs n1, d1, n1 + adcs n2, d2, xzr + mov n3, #0xfffffffeffffffff + sbcs n3, d3, n3 + sbcs c, c, xzr + +// Select result according to whether (x + y) - p_sm2 < 0 + + csel d0, d0, n0, cc + csel d1, d1, n1, cc + csel d2, d2, n2, cc + csel d3, d3, n3, cc + +// Store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_cmul_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_cmul_sm2.S new file mode 100644 index 00000000000..75a54982399 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_cmul_sm2.S @@ -0,0 +1,103 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
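bignum_add_sm2 above is the usual add-then-trial-subtract pattern: form x + y with a carry word, subtract p_sm2, and keep the subtracted value exactly when the subtraction does not borrow. In Python terms (illustration only; p_sm2 is read off the constants in the code):

```python
# The add/select pattern of bignum_add_sm2: z = (x + y) mod p_sm2 for
# already-reduced x and y, using a single trial subtraction of p_sm2.
import random

p_sm2 = 2**256 - 2**224 - 2**96 + 2**64 - 1   # 0xFFFFFFFEFFFF...0000FFFFFFFFFFFFFFFF

def add_sm2(x, y):
    s = x + y                        # [c; d3..d0], at most 257 bits
    t = s - p_sm2                    # trial subtraction
    return t if t >= 0 else s        # select on the (inverted) borrow

for _ in range(1000):
    x, y = random.randrange(p_sm2), random.randrange(p_sm2)
    assert add_sm2(x, y) == (x + y) % p_sm2
```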
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word modulo p_sm2, z := (c * x) mod p_sm2, assuming +// x reduced +// Inputs c, x[4]; output z[4] +// +// extern void bignum_cmul_sm2 +// (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = c, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_sm2) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_sm2_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_sm2_alt) + .text + .balign 4 + +#define z x0 +#define m x1 +#define x x2 + +#define d0 x3 +#define d1 x4 +#define d2 x5 +#define d3 x6 +#define h x7 +#define q x8 +#define a1 x9 +#define a2 x10 +#define a3 x11 +#define a4 x12 + +S2N_BN_SYMBOL(bignum_cmul_sm2): +S2N_BN_SYMBOL(bignum_cmul_sm2_alt): + +// First do the multiply, straightforwardly to get [h;d3;d2;d1;d0] + + ldp a1, a2, [x] + ldp a3, a4, [x, #16] + mul d0, m, a1 + mul d1, m, a2 + mul d2, m, a3 + mul d3, m, a4 + umulh a1, m, a1 + umulh a2, m, a2 + umulh a3, m, a3 + umulh h, m, a4 + adds d1, d1, a1 + adcs d2, d2, a2 + adcs d3, d3, a3 + adc h, h, xzr + +// Quotient approximation is (h * (1 + 2^32 + 2^64) + d3 + 2^64) >> 64. +// Note that by hypothesis our product is <= (2^64 - 1) * (p_sm2 - 1), +// so there is no need to max this out to avoid wrapping, unlike in the +// more general case of bignum_mod_sm2. + + adds a3, d3, h + mov a2, #1 + adc a1, h, a2 + add a2, h, a3, lsr #32 + add q, a1, a2, lsr #32 + +// Let a3 = q<<32 and a4 = q>>32 then [a2;a1] = 2^32 * q - q + + lsl a3, q, #32 + subs a1, a3, q + lsr a4, q, #32 + sbc a2, a4, xzr + +// Do the basic correction as [h;d3;d2;d1;d0] := [h;d3;d2;d1;d0] - q * p_sm2 + + sub h, h, q + adds d0, d0, q + adcs d1, d1, a1 + adcs d2, d2, a2 + adcs d3, d3, a3 + adc h, h, a4 + +// Use top word (which will be all zeros or all ones) as a mask to correct + + adds d0, d0, h + and a1, h, #0xffffffff00000000 + adcs d1, d1, a1 + adcs d2, d2, h + and a3, h, #0xfffffffeffffffff + adc d3, d3, a3 + +// Finally store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_deamont_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_deamont_sm2.S new file mode 100644 index 00000000000..0cc467ea18b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_deamont_sm2.S @@ -0,0 +1,109 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from almost-Montgomery form, z := (x / 2^256) mod p_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_deamont_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Convert a 4-digit bignum x out of its (optionally almost) Montgomery form, +// "almost" meaning any 4-digit input will work, with no range restriction. 
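For reference, bignum_deamont_sm2 computes x * 2^-256 mod p_sm2, and "almost" means the input may be any 4-word value, not necessarily below p_sm2. A minimal Python model of that contract (illustration only):

```python
# Reference semantics of bignum_deamont_sm2: z = (x / 2^256) mod p_sm2,
# valid for any 256-bit x (the "almost-Montgomery" case).
import random

p_sm2 = 2**256 - 2**224 - 2**96 + 2**64 - 1
R = 2**256
R_inv = pow(R, -1, p_sm2)

def deamont(x):
    return (x * R_inv) % p_sm2

for _ in range(1000):
    a = random.getrandbits(256)              # no range restriction on the input
    assert deamont((a * R) % p_sm2) == a % p_sm2
```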
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_deamont_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_deamont_sm2) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1], and using t0, t1, t2 and t3 as +// temporaries. It is fine for d4 to be the same register as d0, +// and it often is. +// --------------------------------------------------------------------------- + +#define montreds(d4,d3,d2,d1,d0, t3,t2,t1,t0) \ +/* First let [t3;t2] = 2^32 * d0 and [t1;t0] = (2^32-1) * d0 */ \ + lsl t2, d0, #32 __LF \ + lsr t3, d0, #32 __LF \ + subs t0, t2, d0 __LF \ + sbc t1, t3, xzr __LF \ +/* Now [d4;d3;d2;d1] := [d0;d3;d2;d1] - [t3;t2;t1;t0] */ \ + subs d1, d1, t0 __LF \ + sbcs d2, d2, t1 __LF \ + sbcs d3, d3, t2 __LF \ + sbc d4, d0, t3 + +// Input parameters + +#define z x0 +#define x x1 + +// Rotating registers for the intermediate windows (with repetitions) + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 + +// Other temporaries + +#define t x6 +#define u x7 +#define v x8 +#define w x9 + +S2N_BN_SYMBOL(bignum_deamont_sm2): + +// Set up an initial window with the input x + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Systematically scroll left doing 1-step reductions. This process +// keeps things inside 4 digits (i.e. < 2^256) at each stage, since +// we have w * p_sm2 + x <= (2^64 - 1) * p_sm2 + (2 EXP 256 - 1) +// <= (2^64 - 1) * (2^256 - 1) + (2 EXP 256 - 1) <= 2^64 * (2^256 - 1) + + montreds(d0,d3,d2,d1,d0, t,u,v,w) + + montreds(d1,d0,d3,d2,d1, t,u,v,w) + + montreds(d2,d1,d0,d3,d2, t,u,v,w) + + montreds(d3,d2,d1,d0,d3, t,u,v,w) + +// Let [w;v;u;t] = [d3;d2;d1;d0] - p_sm2 + + subs t, d0, #-1 + mov u, #0xffffffff00000000 + sbcs u, d1, u + adcs v, d2, xzr + mov w, #0xfffffffeffffffff + sbcs w, d3, w + +// If [d3;d2;d1;d0] < p_sm2 then [d3;d2;d1;d0] is the final answer, +// being reduced mod p_sm2, otherwise [d3;d2;d1;d0] - p_sm2. + + csel d0, d0, t, cc + csel d1, d1, u, cc + csel d2, d2, v, cc + csel d3, d3, w, cc + +// Write back result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_demont_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_demont_sm2.S new file mode 100644 index 00000000000..a10906d4f1b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_demont_sm2.S @@ -0,0 +1,90 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from Montgomery form z := (x / 2^256) mod p_sm2, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_demont_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// This assumes the input is < p_sm2 for correctness. If this is not the case, +// use the variant "bignum_deamont_sm2" instead. 
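The montreds macro used by both conversion routines is one word-level Montgomery step: since p_sm2 ≡ -1 (mod 2^64), the Montgomery multiplier is simply d0, and the shift/subtract trick computes (x + d0 * p_sm2) / 2^64 without a 4-word multiplication. A Python sketch of that identity (illustration only):

```python
# One montreds step: result = (x + d0*p_sm2) >> 64 where d0 = x mod 2^64,
# computed as [d0;d3;d2;d1] - [t3;t2;t1;t0] with [t3;t2] = 2^32*d0 and
# [t1;t0] = (2^32 - 1)*d0, exactly as in the macro.
import random

p_sm2 = 2**256 - 2**224 - 2**96 + 2**64 - 1
M = 2**64 - 1

def montreds(x):
    d0, d1, d2, d3 = x & M, (x >> 64) & M, (x >> 128) & M, x >> 192
    minuend    = d1 | (d2 << 64) | (d3 << 128) | (d0 << 192)   # [d0;d3;d2;d1]
    subtrahend = (2**32 - 1) * d0 + ((2**32 * d0) << 128)      # [t3;t2;t1;t0]
    return minuend - subtrahend

for _ in range(1000):
    x = random.getrandbits(256)
    r = montreds(x)
    assert r == (x + (x & M) * p_sm2) >> 64   # x + d0*p_sm2 is divisible by 2^64
    assert 0 <= r < 2**256                    # stays within four words
    assert (r * 2**64 - x) % p_sm2 == 0       # i.e. r == x / 2^64 (mod p_sm2)
```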
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_demont_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_demont_sm2) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1], and using t0, t1, t2 and t3 as +// temporaries. It is fine for d4 to be the same register as d0, +// and it often is. +// --------------------------------------------------------------------------- + +#define montreds(d4,d3,d2,d1,d0, t3,t2,t1,t0) \ +/* First let [t3;t2] = 2^32 * d0 and [t1;t0] = (2^32-1) * d0 */ \ + lsl t2, d0, #32 __LF \ + lsr t3, d0, #32 __LF \ + subs t0, t2, d0 __LF \ + sbc t1, t3, xzr __LF \ +/* Now [d4;d3;d2;d1] := [d0;d3;d2;d1] - [t3;t2;t1;t0] */ \ + subs d1, d1, t0 __LF \ + sbcs d2, d2, t1 __LF \ + sbcs d3, d3, t2 __LF \ + sbc d4, d0, t3 + +// Input parameters + +#define z x0 +#define x x1 + +// Rotating registers for the intermediate windows (with repetitions) + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 + +// Other temporaries + +#define t x6 +#define u x7 +#define v x8 +#define w x9 + +S2N_BN_SYMBOL(bignum_demont_sm2): + +// Set up an initial window with the input x + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Systematically scroll left doing 1-step reductions. This process +// keeps things reduced < p_sm2 at each stage, since we have +// w * p_sm2 + x <= (2^64 - 1) * p_sm2 + (p_sm2 - 1) < 2^64 * p_sm2 + + montreds(d0,d3,d2,d1,d0, t,u,v,w) + + montreds(d1,d0,d3,d2,d1, t,u,v,w) + + montreds(d2,d1,d0,d3,d2, t,u,v,w) + + montreds(d3,d2,d1,d0,d3, t,u,v,w) + +// Write back result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_double_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_double_sm2.S new file mode 100644 index 00000000000..629c3c33174 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_double_sm2.S @@ -0,0 +1,72 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Double modulo p_sm2, z := (2 * x) mod p_sm2, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_double_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_double_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_double_sm2) + .text + .balign 4 + +#define z x0 +#define x x1 +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 +#define c x6 +#define n0 x7 +#define n1 x8 +#define n2 x9 +#define n3 x10 + +S2N_BN_SYMBOL(bignum_double_sm2): + +// Double the input number as 2 * x = c + [d3; d2; d1; d0] +// It's worth considering doing this with extr...63 instead + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + adds d0, d0, d0 + adcs d1, d1, d1 + adcs d2, d2, d2 + adcs d3, d3, d3 + adc c, xzr, xzr + +// Subtract p_sm2 to give 2 * x - p_sm2 = c + [n3; n2; n1; n0] + + subs n0, d0, #0xffffffffffffffff + mov n1, #0xffffffff00000000 + sbcs n1, d1, n1 + adcs n2, d2, xzr + mov n3, #0xfffffffeffffffff + sbcs n3, d3, n3 + sbcs c, c, xzr + +// Now CF is set (because of inversion) if 2 * x >= p_sm2, in which case the +// correct result is [n3; n2; n1; n0], otherwise [d3; d2; d1; d0] + + csel d0, d0, n0, cc + csel d1, d1, n1, cc + csel d2, d2, n2, cc + csel d3, d3, n3, cc + +// Store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_half_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_half_sm2.S new file mode 100644 index 00000000000..b144c9757bc --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_half_sm2.S @@ -0,0 +1,72 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Halve modulo p_sm2, z := (x / 2) mod p_sm2, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_half_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_half_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_half_sm2) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 +#define d4 x6 +#define m x7 +#define n x8 + + +S2N_BN_SYMBOL(bignum_half_sm2): + +// Load the 4 digits of x + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Get a bitmask corresponding to the lowest bit of the input + + and m, d0, #1 + neg m, m + +// Do a masked addition of p_sm2, catching carry in a 5th word + + adds d0, d0, m + and n, m, #0xffffffff00000000 + adcs d1, d1, n + adcs d2, d2, m + and n, m, #0xfffffffeffffffff + adcs d3, d3, n + adc d4, xzr, xzr + +// Now shift that sum right one place + + extr d0, d1, d0, #1 + extr d1, d2, d1, #1 + extr d2, d3, d2, #1 + extr d3, d4, d3, #1 + +// Store back + + stp d0, d1, [z] + stp d2, d3, [z, #16] + +// Return + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_inv_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_inv_sm2.S new file mode 100644 index 00000000000..c28197c2da8 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_inv_sm2.S @@ -0,0 +1,1270 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Modular inverse modulo p_sm2 = 2^256 - 2^224 - 2^96 + 2^64 - 1 +// Input x[4]; output z[4] +// +// extern void bignum_inv_sm2(uint64_t z[static 4],uint64_t x[static 4]); +// +// If the 4-digit input x is coprime to p_sm2, i.e. is not divisible +// by it, returns z < p_sm2 such that x * z == 1 (mod p_sm2). Note that +// x does not need to be reduced modulo p_sm2, but the output always is. +// If the input is divisible (i.e. is 0 or p_sm2), then there can be no +// modular inverse and z = 0 is returned. 
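The inverse is computed with signed divsteps, applied in batches of 59 at a time to low-order windows of f and g by the divstep59 macro further down. For orientation, a single divstep in the same d = 2*delta convention can be sketched on plain signed words as follows; this is a textbook sketch that assumes arithmetic right shift on signed integers and ignores the multi-word state and window packing.

#include <stdint.h>

// One signed divstep: f stays odd throughout; when d > 0 and g is odd the
// roles swap and g := (g - f)/2, otherwise g := (g + (g mod 2)*f)/2.
// In both cases d is updated so that it remains 2*delta.
static void divstep(int64_t *d, int64_t *f, int64_t *g) {
  if (*d > 0 && (*g & 1)) {
    int64_t old_f = *f;
    *d = 2 - *d;
    *f = *g;
    *g = (*g - old_f) >> 1;                  // f and g both odd, so exact halving
  } else {
    *d = 2 + *d;
    *g = (*g + ((*g & 1) ? *f : 0)) >> 1;    // force g even, then halve
  }
}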
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_inv_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_inv_sm2) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Used for the return pointer + +#define res x20 + +// Loop counter and d = 2 * delta value for divstep + +#define i x21 +#define d x22 + +// Registers used for matrix element magnitudes and signs + +#define m00 x10 +#define m01 x11 +#define m10 x12 +#define m11 x13 +#define s00 x14 +#define s01 x15 +#define s10 x16 +#define s11 x17 + +// Initial carries for combinations + +#define car0 x9 +#define car1 x19 + +// Input and output, plain registers treated according to pattern + +#define reg0 x0, #0 +#define reg1 x1, #0 +#define reg2 x2, #0 +#define reg3 x3, #0 +#define reg4 x4, #0 + +#define x x1, #0 +#define z x0, #0 + +// Pointer-offset pairs for temporaries on stack + +#define f sp, #0 +#define g sp, #(6*N) +#define u sp, #(12*N) +#define v sp, #(16*N) + +// Total size to reserve on the stack + +#define NSPACE #(20*N) + +// --------------------------------------------------------------------------- +// Core signed almost-Montgomery reduction macro. Takes input in +// [d4;d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to +// the existing [d4;d3;d2;d1], and re-using d0 as a temporary internally +// as well as t0, t1, t2, t3. This is almost-Montgomery, i.e. the result +// fits in 4 digits but is not necessarily strictly reduced mod p_sm2. +// --------------------------------------------------------------------------- + +#define amontred(d4,d3,d2,d1,d0, t3,t2,t1,t0) \ +/* We only know the input is -2^316 < x < 2^316. To do traditional */ \ +/* unsigned Montgomery reduction, start by adding 2^61 * p_sm2. */ \ + mov t0, #0xe000000000000000 __LF \ + adds d0, d0, t0 __LF \ + mov t1, #0x1fffffffffffffff __LF \ + adcs d1, d1, t1 __LF \ + mov t2, #0xffffffffe0000000 __LF \ + adcs d2, d2, t2 __LF \ + sbcs d3, d3, xzr __LF \ + and t0, t1, #0xffffffffdfffffff __LF \ + adc d4, d4, t0 __LF \ +/* First let [t3;t2] = 2^32 * d0 and [t1;t0] = (2^32-1) * d0 */ \ + lsl t2, d0, #32 __LF \ + lsr t3, d0, #32 __LF \ + subs t0, t2, d0 __LF \ + sbc t1, t3, xzr __LF \ +/* Now [d4;d3;d2;d1] := [d0;d3;d2;d1] - [t3;t2;t1;t0] */ \ + subs d1, d1, t0 __LF \ + sbcs d2, d2, t1 __LF \ + sbcs d3, d3, t2 __LF \ + sbc t0, d0, t3 __LF \ + adds d4, d4, t0 __LF \ +/* Now capture top carry and subtract p_sm2 if set (almost-Montgomery) */ \ + csetm t0, cs __LF \ + subs d1, d1, t0 __LF \ + and t1, t0, #0xffffffff00000000 __LF \ + sbcs d2, d2, t1 __LF \ + and t2, t0, #0xfffffffeffffffff __LF \ + sbcs d3, d3, t0 __LF \ + sbc d4, d4, t2 + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. 
+// But different in register usage and returning the final matrix in +// registers as follows +// +// [ m00 m01] +// [ m10 m11] + +#define divstep59() \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, 
x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x8, x4, #0x100, lsl #12 __LF \ + sbfx x8, x8, #21, #21 __LF \ + mov x11, #0x100000 __LF \ + add x11, x11, x11, lsl #21 __LF \ + add x9, x4, x11 __LF \ + asr x9, x9, #42 __LF \ + add x10, x5, #0x100, lsl #12 __LF \ + sbfx x10, x10, #21, #21 __LF \ + add x11, x5, x11 __LF \ + asr x11, x11, #42 __LF \ + mul x6, x8, x2 __LF \ + mul x7, x9, x3 __LF \ + mul x2, x10, x2 __LF \ + mul x3, x11, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr 
x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #21, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #42 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, 
x14, #21, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #42 __LF \ + mul x6, x12, x2 __LF \ + mul x7, x13, x3 __LF \ + mul x2, x14, x2 __LF \ + mul x3, x15, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + mul x2, x12, x8 __LF \ + mul x3, x12, x9 __LF \ + mul x6, x14, x8 __LF \ + mul x7, x14, x9 __LF \ + madd x8, x13, x10, x2 __LF \ + madd x9, x13, x11, x3 __LF \ + madd x16, x15, x10, x6 __LF \ + madd x17, x15, x11, x7 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, 
#0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #22, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #43 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, x14, #22, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #43 __LF \ + mneg x2, x12, x8 __LF \ + mneg x3, x12, x9 __LF \ + mneg x4, x14, x8 __LF \ + mneg x5, x14, x9 __LF \ + msub m00, x13, x16, x2 __LF \ + msub m01, x13, x17, x3 __LF \ + msub m10, x15, x16, x4 __LF \ + msub m11, x15, x17, x5 + +S2N_BN_SYMBOL(bignum_inv_sm2): + +// Save registers and make room for temporaries + + stp x19, x20, [sp, -16]! + stp x21, x22, [sp, -16]! + stp x23, x24, [sp, -16]! + sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Copy the prime and input into the main f and g variables respectively. +// Make sure x is reduced so that g <= f as assumed in the bound proof. + + mov x10, #0xffffffffffffffff + mov x11, #0xffffffff00000000 + mov x13, #0xfffffffeffffffff + stp x10, x11, [f] + stp x10, x13, [f+2*N] + str xzr, [f+4*N] + + ldp x2, x3, [x1] + subs x10, x2, #-1 + sbcs x11, x3, x11 + ldp x4, x5, [x1, #(2*N)] + adcs x12, x4, xzr + sbcs x13, x5, x13 + + csel x2, x2, x10, cc + csel x3, x3, x11, cc + csel x4, x4, x12, cc + csel x5, x5, x13, cc + + stp x2, x3, [g] + stp x4, x5, [g+2*N] + str xzr, [g+4*N] + +// Also maintain reduced < 2^256 vector [u,v] such that +// [f,g] == x * 2^{5*i-50} * [u,v] (mod p_sm2) +// starting with [p_sm2,x] == x * 2^{5*0-50} * [0,2^50] (mod p_sm2) +// The weird-looking 5*i modifications come in because we are doing +// 64-bit word-sized Montgomery reductions at each stage, which is +// 5 bits more than the 59-bit requirement to keep things stable. + + stp xzr, xzr, [u] + stp xzr, xzr, [u+2*N] + + mov x10, #0x0004000000000000 + stp x10, xzr, [v] + stp xzr, xzr, [v+2*N] + +// Start of main loop. 
We jump into the middle so that the divstep +// portion is common to the special tenth iteration after a uniform +// first 9. + + mov i, #10 + mov d, #1 + b bignum_inv_sm2_midloop + +bignum_inv_sm2_loop: + +// Separate the matrix elements into sign-magnitude pairs + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in stable registers for the [u,v] part and do [f,g] first. + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + + and x0, m10, s10 + and x1, m11, s11 + add car1, x0, x1 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. +// +// Digit 0 of [f,g] + + ldr x7, [f] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [g] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x3, x3, x1 + +// Digit 1 of [f,g] + + ldr x7, [f+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [g+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [f] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [g] + +// Digit 2 of [f,g] + + ldr x7, [f+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [g+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [f+N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [g+N] + +// Digits 3 and 4 of [f,g] + + ldr x7, [f+3*N] + eor x1, x7, s00 + ldr x23, [f+4*N] + eor x3, x23, s00 + and x3, x3, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [g+3*N] + eor x1, x8, s01 + ldr x24, [g+4*N] + eor x0, x24, s01 + and x0, x0, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [f+2*N] + extr x5, x3, x5, #59 + str x5, [f+3*N] + asr x3, x3, #59 + str x3, [f+4*N] + + eor x1, x7, s10 + eor x5, x23, s10 + and x5, x5, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + eor x0, x24, s11 + and x0, x0, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [g+2*N] + extr x2, x5, x2, #59 + str x2, [g+3*N] + asr x5, x5, #59 + str x5, [g+4*N] + +// Now the computation of the updated u and v values and their +// Montgomery reductions. 
A very similar accumulation except that +// the top words of u and v are unsigned and we don't shift. +// +// Digit 0 of [u,v] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v] + adc x3, x3, x1 + +// Digit 1 of [u,v] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + str x3, [v+N] + adc x4, x4, x1 + +// Digit 2 of [u,v] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + str x4, [v+2*N] + adc x2, x2, x1 + +// Digits 3 and 4 of u (top is unsigned) + + ldr x7, [u+3*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + +// Montgomery reduction of u + + ldp x0, x1, [u] + ldr x6, [u+2*N] + amontred(x3,x5,x6,x1,x0, x24,x10,x11,x14) + stp x1, x6, [u] + stp x5, x3, [u+16] + +// Digits 3 and 4 of v (top is unsigned) + + eor x1, x7, s10 + and x5, s10, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + and x0, s11, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + +// Montgomery reduction of v + + ldp x0, x1, [v] + ldr x3, [v+2*N] + amontred(x5,x2,x3,x1,x0, x24,x10,x11,x14) + stp x1, x3, [v] + stp x2, x5, [v+16] + +bignum_inv_sm2_midloop: + + mov x1, d + ldr x2, [f] + ldr x3, [g] + divstep59() + mov d, x1 + +// Next iteration + + subs i, i, #1 + bne bignum_inv_sm2_loop + +// The 10th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + ldr x0, [f] + ldr x1, [g] + mul x0, x0, m00 + madd x1, x1, m01, x0 + asr x0, x1, #63 + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * [u,v] (mod p_sm2) +// we want to flip the sign of u according to that of f. 
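In C terms that sign extraction is just a multiply-accumulate of the low digits followed by an arithmetic shift into a mask; a small sketch (the helper name and scalar arguments are illustrative, the real code works on the stored digits and the matrix registers):

#include <stdint.h>

// Returns 0 if the updated f is +1 and all-ones if it is -1, given the low
// digits f0, g0 and the signed matrix entries m00, m01. The low 64 bits of
// f0*m00 + g0*m01 suffice because |f| = 1 at this point.
static uint64_t sign_mask_of_f(uint64_t f0, uint64_t g0,
                               int64_t m00, int64_t m01) {
  uint64_t low = f0 * (uint64_t)m00 + g0 * (uint64_t)m01;
  return (uint64_t)((int64_t)low >> 63);     // arithmetic shift to a mask
}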
+ + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + eor s00, s00, x0 + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + eor s01, s01, x0 + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + eor s10, s10, x0 + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + eor s11, s11, x0 + +// Adjust the initial value to allow for complement instead of negation + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + +// Digit 0 of [u] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + +// Digit 1 of [u] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + +// Digit 2 of [u] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + +// Digits 3 and 4 of u (top is unsigned) + + ldr x7, [u+3*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + +// Montgomery reduction of u. This needs to be strict not "almost" +// so it is followed by an optional subtraction of p_sm2 + + ldp x0, x1, [u] + ldr x2, [u+2*N] + amontred(x3,x5,x2,x1,x0, x24,x10,x11,x14) + + mov x10, #0xffffffffffffffff + subs x10, x1, #-1 + mov x11, #0xffffffff00000000 + sbcs x11, x2, x11 + mov x13, #0xfffffffeffffffff + adcs x12, x5, xzr + sbcs x13, x3, x13 + + csel x10, x1, x10, cc + csel x11, x2, x11, cc + csel x12, x5, x12, cc + csel x13, x3, x13, cc + +// Store it back to the final output + + stp x10, x11, [res] + stp x12, x13, [res, #16] + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_mod_nsm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_mod_nsm2.S new file mode 100644 index 00000000000..f81048a14a3 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_mod_nsm2.S @@ -0,0 +1,175 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod n_sm2 +// Input x[k]; output z[4] +// +// extern void bignum_mod_nsm2 +// (uint64_t z[static 4], uint64_t k, uint64_t *x); +// +// Reduction is modulo the group order of the GM/T 0003-2012 curve SM2. 
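Each pass of the reduction loop below folds one more digit into the 4-word state using the quotient approximation quoted in the loop comment, q = MIN((m3*(1 + 2^32 + 2^64) + m2 + 2^64) >> 64, 2^64 - 1); the same estimate appears in bignum_mod_sm2 further below. A direct C rendering of that formula, as a sketch assuming unsigned __int128 (the assembly realises it with flag tricks rather than a 128-bit type, and the helper name is hypothetical):

#include <stdint.h>

// Quotient estimate for one 5 -> 4 digit reduction step, computed from the
// top two digits m3 and m2 of the current value.
static uint64_t quotient_estimate(uint64_t m3, uint64_t m2) {
  unsigned __int128 s = (unsigned __int128)m3 * ((1ULL << 32) + 1)    // m3*(1 + 2^32)
                      + m2 + ((unsigned __int128)1 << 64);            // + m2 + 2^64
  unsigned __int128 q = (unsigned __int128)m3 + (uint64_t)(s >> 64);  // add the m3*2^64 term
  return q > 0xffffffffffffffffULL ? 0xffffffffffffffffULL : (uint64_t)q;
}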
+// +// Standard ARM ABI: X0 = z, X1 = k, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_nsm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_nsm2) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_nsm2_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_nsm2_alt) + .text + .balign 4 + +#define z x0 +#define k x1 +#define x x2 + +#define m0 x3 +#define m1 x4 +#define m2 x5 +#define m3 x6 + +#define t0 x7 +#define t1 x8 +#define t2 x9 +#define t3 x10 +#define t4 x11 + +#define n0 x12 +#define n1 x13 +#define n3 x14 + +// These two are aliased: we only load d when finished with q + +#define q x15 +#define d x15 + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(bignum_mod_nsm2): + +S2N_BN_SYMBOL(bignum_mod_nsm2_alt): + +// If the input is already <= 3 words long, go to a trivial "copy" path + + cmp k, #4 + bcc bignum_mod_nsm2_short + +// Otherwise load the top 4 digits (top-down) and reduce k by 4 + + sub k, k, #4 + lsl t0, k, #3 + add t0, t0, x + ldp m2, m3, [t0, #16] + ldp m0, m1, [t0] + +// Load the complicated three words of 2^256 - n_sm2 = [n3; 0; n1; n0] + + movbig(n0, #0xac44, #0x0bf6, #0xc62a, #0xbedd) + movbig(n1, #0x8dfc, #0x2094, #0xde39, #0xfad4) + mov n3, 0x0000000100000000 + +// Reduce the top 4 digits mod n_sm2 (a conditional subtraction of n_sm2) + + adds t0, m0, n0 + adcs t1, m1, n1 + adcs t2, m2, xzr + adcs t3, m3, n3 + csel m0, m0, t0, cc + csel m1, m1, t1, cc + csel m2, m2, t2, cc + csel m3, m3, t3, cc + +// Now do (k-4) iterations of 5->4 word modular reduction + + cbz k, bignum_mod_nsm2_writeback +bignum_mod_nsm2_loop: + +// Writing the input, with the new zeroth digit implicitly appended, as +// z = 2^256 * m3 + 2^192 * m2 + t, our intended quotient approximation is +// MIN ((m3 * (1 + 2^32 + 2^64) + m2 + 2^64) >> 64) (2^64 - 1) + + adds t0, m2, m3 + mov t2, #1 + adc t1, m3, t2 + add t2, m3, t0, lsr #32 + adds q, t1, t2, lsr #32 + cinv q, q, cs + +// [t4;t3;t2;t1;t0] = q * (2^256 - n_sm2) + + mul t0, n0, q + mul t1, n1, q + mul t3, n3, q + umulh t2, n0, q + adds t1, t1, t2 + umulh t2, n1, q + adc t2, t2, xzr // No carry: high of mul + {0,1} + umulh t4, n3, q + +// Compensate for 2^256 * q + + sub m3, m3, q + +// Decrement k and load the next digit (note that d aliases to q) + + sub k, k, #1 + ldr d, [x, k, lsl #3] + +// [t4;t3;t2;t1;t0] = [m3;m2;m1;m0;d] - q * n_sm2 + + adds t0, d, t0 + adcs t1, m0, t1 + adcs t2, m1, t2 + adcs t3, m2, t3 + adc t4, m3, t4 + +// Now our top word t4 is either zero or all 1s. 
Use it for a masked +// addition of n_sm2, which we can do by a *subtraction* of +// 2^256 - n_sm2 from our portion, re-using the constants + + and d, t4, n0 + subs m0, t0, d + and d, t4, n1 + sbcs m1, t1, d + sbcs m2, t2, xzr + and d, t4, n3 + sbc m3, t3, d + + cbnz k, bignum_mod_nsm2_loop + +// Finally write back [m3;m2;m1;m0] and return + +bignum_mod_nsm2_writeback: + stp m0, m1, [z] + stp m2, m3, [z, #16] + ret + +// Short case: just copy the input with zero-padding + +bignum_mod_nsm2_short: + mov m0, xzr + mov m1, xzr + mov m2, xzr + mov m3, xzr + + cbz k, bignum_mod_nsm2_writeback + ldr m0, [x] + subs k, k, #1 + beq bignum_mod_nsm2_writeback + ldr m1, [x, #8] + subs k, k, #1 + beq bignum_mod_nsm2_writeback + ldr m2, [x, #16] + b bignum_mod_nsm2_writeback + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_mod_nsm2_4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_mod_nsm2_4.S new file mode 100644 index 00000000000..dd1bc66bea2 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_mod_nsm2_4.S @@ -0,0 +1,81 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod n_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_mod_nsm2_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Reduction is modulo the group order of the GM/T 0003-2012 curve SM2. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_nsm2_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_nsm2_4) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define n0 x2 +#define n1 x3 +#define n2 x4 +#define n3 x5 + +#define d0 x6 +#define d1 x7 +#define d2 x8 +#define d3 x9 + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(bignum_mod_nsm2_4): + +// Load the complicated three words of n_sm2, the other being all 1s + + movbig( n0, #0x53BB, #0xF409, #0x39D5, #0x4123) + movbig( n1, #0x7203, #0xDF6B, #0x21C6, #0x052B) + mov n3, #0xFFFFFFFEFFFFFFFF + +// Load the input number + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Do the subtraction. Since word 2 of n_sm2 is all 1s, that can be +// done by adding zero with carry, thanks to the inverted carry. + + subs n0, d0, n0 + sbcs n1, d1, n1 + adcs n2, d2, xzr + sbcs n3, d3, n3 + +// Now if the carry is *clear* (inversion at work) the subtraction carried +// and hence we should have done nothing, so we reset each n_i = d_i + + csel n0, d0, n0, cc + csel n1, d1, n1, cc + csel n2, d2, n2, cc + csel n3, d3, n3, cc + +// Store the end result + + stp n0, n1, [z] + stp n2, n3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_mod_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_mod_sm2.S new file mode 100644 index 00000000000..e847008b1b2 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_mod_sm2.S @@ -0,0 +1,150 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo field characteristic, z := x mod p_sm2 +// Input x[k]; output z[4] +// +// extern void bignum_mod_sm2 +// (uint64_t z[static 4], uint64_t k, uint64_t *x); +// +// Standard ARM ABI: X0 = z, X1 = k, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_sm2) + .text + .balign 4 + +#define z x0 +#define k x1 +#define x x2 + +#define m0 x3 +#define m1 x4 +#define m2 x5 +#define m3 x6 + +#define t0 x7 +#define t1 x8 +#define t2 x9 +#define t3 x10 +#define t4 x11 + +#define n1 x12 +#define n3 x13 + +#define q x14 + + +S2N_BN_SYMBOL(bignum_mod_sm2): + +// If the input is already <= 3 words long, go to a trivial "copy" path + + cmp k, #4 + bcc bignum_mod_sm2_short + +// Otherwise load the top 4 digits (top-down) and reduce k by 4 + + sub k, k, #4 + lsl t0, k, #3 + add t0, t0, x + ldp m2, m3, [t0, #16] + ldp m0, m1, [t0] + +// Load the complicated words of p_sm2 = [n3;-1;n1;-1] + + mov n1, #0xffffffff00000000 + mov n3, #0xfffffffeffffffff + +// Reduce the top 4 digits mod p_sm2 (a conditional subtraction of p_sm2) + + subs t0, m0, #-1 + sbcs t1, m1, n1 + adcs t2, m2, xzr + sbcs t3, m3, n3 + + csel m0, m0, t0, cc + csel m1, m1, t1, cc + csel m2, m2, t2, cc + csel m3, m3, t3, cc + +// Now do (k-4) iterations of 5->4 word modular reduction + + cbz k, bignum_mod_sm2_writeback +bignum_mod_sm2_loop: + +// Decrement k and load the next digit as t0. We then want to reduce +// [m3;m2;m1;m0;t0] |-> [m3;m2;m1;m0]; the shuffling downwards is absorbed +// into the various ALU operations + + sub k, k, #1 + ldr t0, [x, k, lsl #3] + +// Writing the input, with the new zeroth digit t0 appended, as +// z = 2^256 * m3 + 2^192 * m2 + t, our intended quotient approximation is +// MIN ((m3 * (1 + 2^32 + 2^64) + m2 + 2^64) >> 64) (2^64 - 1) + + adds t3, m2, m3 + mov t2, #1 + adc t1, m3, t2 + add t2, m3, t3, lsr #32 + adds q, t1, t2, lsr #32 + cinv q, q, cs + +// Let t3 = q<<32 and t4 = q>>32 then [t2;t1] = 2^32 * q - q + + lsl t3, q, #32 + subs t1, t3, q + lsr t4, q, #32 + sbc t2, t4, xzr + +// Do the basic correction to get [t4;t2;t2;t1;t0] = [m3;m2;m1;m0;t0] - q * p + + adds t0, t0, q + adcs t1, t1, m0 + sub m3, m3, q + adcs t2, t2, m1 + adcs t3, t3, m2 + adc t4, t4, m3 + +// Use top word as mask to correct + + adds m0, t0, t4 + and t0, n1, t4 + adcs m1, t1, t0 + adcs m2, t2, t4 + and t0, n3, t4 + adc m3, t3, t0 + + cbnz k, bignum_mod_sm2_loop + +// Finally write back [m3;m2;m1;m0] and return + +bignum_mod_sm2_writeback: + stp m0, m1, [z] + stp m2, m3, [z, #16] + ret + +// Short case: just copy the input with zero-padding + +bignum_mod_sm2_short: + mov m0, xzr + mov m1, xzr + mov m2, xzr + mov m3, xzr + + cbz k, bignum_mod_sm2_writeback + ldr m0, [x] + subs k, k, #1 + beq bignum_mod_sm2_writeback + ldr m1, [x, #8] + subs k, k, #1 + beq bignum_mod_sm2_writeback + ldr m2, [x, #16] + b bignum_mod_sm2_writeback + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_mod_sm2_4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_mod_sm2_4.S new file mode 100644 index 00000000000..4654f667989 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_mod_sm2_4.S @@ -0,0 +1,70 @@ 
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo field characteristic, z := x mod p_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_mod_sm2_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_sm2_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_sm2_4) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define n0 x2 +#define n1 x3 +#define n2 x4 +#define n3 x5 + +#define d0 x6 +#define d1 x7 +#define d2 x8 +#define d3 x9 + + +S2N_BN_SYMBOL(bignum_mod_sm2_4): + +// Load the non-trivial words of p_sm2 = [n3;-1;n2;-1] + + mov n1, #0xffffffff00000000 + mov n3, #0xfffffffeffffffff + +// Load the input number + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Do the subtraction. + + subs n0, d0, #-1 + sbcs n1, d1, n1 + adcs n2, d2, xzr + sbcs n3, d3, n3 + +// Now if the carry is *clear* (inversion at work) the subtraction carried +// and hence we should have done nothing, so we reset each n_i = d_i + + csel n0, d0, n0, cc + csel n1, d1, n1, cc + csel n2, d2, n2, cc + csel n3, d3, n3, cc + +// Store the end result + + stp n0, n1, [z] + stp n2, n3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montinv_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montinv_sm2.S new file mode 100644 index 00000000000..fbcb136911a --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montinv_sm2.S @@ -0,0 +1,1290 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery inverse modulo p_sm2 = 2^256 - 2^224 - 2^96 + 2^64 - 1 +// Input x[4]; output z[4] +// +// extern void bignum_montinv_sm2(uint64_t z[static 4],uint64_t x[static 4]); +// +// If the 4-digit input x is coprime to p_sm2, i.e. is not divisible +// by it, returns z < p_sm2 such that x * z == 2^512 (mod p_sm2). This +// is effectively "Montgomery inverse" because if we consider x and z as +// Montgomery forms of X and Z, i.e. x == 2^256 * X and z == 2^256 * Z +// (both mod p_sm2) then X * Z == 1 (mod p_sm2). That is, this function +// gives the analog of the modular inverse bignum_inv_sm2 but with both +// input and output in the Montgomery domain. Note that x does not need +// to be reduced modulo p_sm2, but the output always is. If the input +// is divisible (i.e. is 0 or p_sm2), then there can be no solution to +// the congruence x * z == 2^512 (mod p_sm2), and z = 0 is returned. 
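For reference, the single conditional subtraction performed by bignum_mod_sm2_4 above (x < 2^256 < 2*p_sm2, so one subtract-and-select suffices), and the final corrections several of these routines end with, can be modelled by the following C sketch; it assumes unsigned __int128 and is not the library's implementation.

#include <stdint.h>

// Subtract p_sm2 with borrow propagation and keep the difference only when
// no borrow came out, selecting in constant time via a mask.
static void mod_sm2_4_sketch(uint64_t z[4], const uint64_t x[4]) {
  static const uint64_t P_SM2[4] = {
    0xffffffffffffffffULL, 0xffffffff00000000ULL,
    0xffffffffffffffffULL, 0xfffffffeffffffffULL
  };
  uint64_t diff[4], borrow = 0;
  for (int i = 0; i < 4; i++) {
    unsigned __int128 t = (unsigned __int128)x[i] - P_SM2[i] - borrow;
    diff[i] = (uint64_t)t;
    borrow = (uint64_t)(t >> 64) & 1;          // 1 if this limb borrowed
  }
  uint64_t mask = (uint64_t)0 - borrow;        // all-ones iff x < p_sm2
  for (int i = 0; i < 4; i++)
    z[i] = (x[i] & mask) | (diff[i] & ~mask);
}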
+ +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montinv_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montinv_sm2) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Used for the return pointer + +#define res x20 + +// Loop counter and d = 2 * delta value for divstep + +#define i x21 +#define d x22 + +// Registers used for matrix element magnitudes and signs + +#define m00 x10 +#define m01 x11 +#define m10 x12 +#define m11 x13 +#define s00 x14 +#define s01 x15 +#define s10 x16 +#define s11 x17 + +// Initial carries for combinations + +#define car0 x9 +#define car1 x19 + +// Input and output, plain registers treated according to pattern + +#define reg0 x0, #0 +#define reg1 x1, #0 +#define reg2 x2, #0 +#define reg3 x3, #0 +#define reg4 x4, #0 + +#define x x1, #0 +#define z x0, #0 + +// Pointer-offset pairs for temporaries on stack + +#define f sp, #0 +#define g sp, #(6*N) +#define u sp, #(12*N) +#define v sp, #(16*N) + +// Total size to reserve on the stack + +#define NSPACE #(20*N) + +// --------------------------------------------------------------------------- +// Core signed almost-Montgomery reduction macro. Takes input in +// [d4;d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to +// the existing [d4;d3;d2;d1], and re-using d0 as a temporary internally +// as well as t0, t1, t2, t3. This is almost-Montgomery, i.e. the result +// fits in 4 digits but is not necessarily strictly reduced mod p_sm2. +// --------------------------------------------------------------------------- + +#define amontred(d4,d3,d2,d1,d0, t3,t2,t1,t0) \ +/* We only know the input is -2^316 < x < 2^316. To do traditional */ \ +/* unsigned Montgomery reduction, start by adding 2^61 * p_sm2. */ \ + mov t0, #0xe000000000000000 __LF \ + adds d0, d0, t0 __LF \ + mov t1, #0x1fffffffffffffff __LF \ + adcs d1, d1, t1 __LF \ + mov t2, #0xffffffffe0000000 __LF \ + adcs d2, d2, t2 __LF \ + sbcs d3, d3, xzr __LF \ + and t0, t1, #0xffffffffdfffffff __LF \ + adc d4, d4, t0 __LF \ +/* First let [t3;t2] = 2^32 * d0 and [t1;t0] = (2^32-1) * d0 */ \ + lsl t2, d0, #32 __LF \ + lsr t3, d0, #32 __LF \ + subs t0, t2, d0 __LF \ + sbc t1, t3, xzr __LF \ +/* Now [d4;d3;d2;d1] := [d0;d3;d2;d1] - [t3;t2;t1;t0] */ \ + subs d1, d1, t0 __LF \ + sbcs d2, d2, t1 __LF \ + sbcs d3, d3, t2 __LF \ + sbc t0, d0, t3 __LF \ + adds d4, d4, t0 __LF \ +/* Now capture top carry and subtract p_sm2 if set (almost-Montgomery) */ \ + csetm t0, cs __LF \ + subs d1, d1, t0 __LF \ + and t1, t0, #0xffffffff00000000 __LF \ + sbcs d2, d2, t1 __LF \ + and t2, t0, #0xfffffffeffffffff __LF \ + sbcs d3, d3, t0 __LF \ + sbc d4, d4, t2 + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. 
+// But different in register usage and returning the final matrix in +// registers as follows +// +// [ m00 m01] +// [ m10 m11] + +#define divstep59() \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, 
x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x8, x4, #0x100, lsl #12 __LF \ + sbfx x8, x8, #21, #21 __LF \ + mov x11, #0x100000 __LF \ + add x11, x11, x11, lsl #21 __LF \ + add x9, x4, x11 __LF \ + asr x9, x9, #42 __LF \ + add x10, x5, #0x100, lsl #12 __LF \ + sbfx x10, x10, #21, #21 __LF \ + add x11, x5, x11 __LF \ + asr x11, x11, #42 __LF \ + mul x6, x8, x2 __LF \ + mul x7, x9, x3 __LF \ + mul x2, x10, x2 __LF \ + mul x3, x11, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr 
x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #21, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #42 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, 
x14, #21, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #42 __LF \ + mul x6, x12, x2 __LF \ + mul x7, x13, x3 __LF \ + mul x2, x14, x2 __LF \ + mul x3, x15, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + mul x2, x12, x8 __LF \ + mul x3, x12, x9 __LF \ + mul x6, x14, x8 __LF \ + mul x7, x14, x9 __LF \ + madd x8, x13, x10, x2 __LF \ + madd x9, x13, x11, x3 __LF \ + madd x16, x15, x10, x6 __LF \ + madd x17, x15, x11, x7 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, 
#0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #22, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #43 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, x14, #22, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #43 __LF \ + mneg x2, x12, x8 __LF \ + mneg x3, x12, x9 __LF \ + mneg x4, x14, x8 __LF \ + mneg x5, x14, x9 __LF \ + msub m00, x13, x16, x2 __LF \ + msub m01, x13, x17, x3 __LF \ + msub m10, x15, x16, x4 __LF \ + msub m11, x15, x17, x5 + +S2N_BN_SYMBOL(bignum_montinv_sm2): + +// Save registers and make room for temporaries + + stp x19, x20, [sp, -16]! + stp x21, x22, [sp, -16]! + stp x23, x24, [sp, -16]! + sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Copy the prime and input into the main f and g variables respectively. +// Make sure x is reduced so that g <= f as assumed in the bound proof. + + mov x10, #0xffffffffffffffff + mov x11, #0xffffffff00000000 + mov x13, #0xfffffffeffffffff + stp x10, x11, [f] + stp x10, x13, [f+2*N] + str xzr, [f+4*N] + + ldp x2, x3, [x1] + subs x10, x2, #-1 + sbcs x11, x3, x11 + ldp x4, x5, [x1, #(2*N)] + adcs x12, x4, xzr + sbcs x13, x5, x13 + + csel x2, x2, x10, cc + csel x3, x3, x11, cc + csel x4, x4, x12, cc + csel x5, x5, x13, cc + + stp x2, x3, [g] + stp x4, x5, [g+2*N] + str xzr, [g+4*N] + +// Also maintain reduced < 2^256 vector [u,v] such that +// [f,g] == x * 2^{5*i-50} * [u,v] (mod p_sm2) +// starting with [p_sm2,x] == x * 2^{5*0-562} * [0,2^562] (mod p_sm2) +// The weird-looking 5*i modifications come in because we are doing +// 64-bit word-sized Montgomery reductions at each stage, which is +// 5 bits more than the 59-bit requirement to keep things stable. +// After the 10th and last iteration and sign adjustment, when +// f == 1 for in-scope cases, we have x * 2^{50-562} * u == 1, i.e. +// x * u == 2^512 as required. 
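
To make the invariant above concrete, here is a small Python sketch of the end-to-end contract it implies (an illustrative model only, not part of the patch; the helper name montinv_sm2_model is made up, and pow with a negative exponent or modular inverse needs Python 3.8+): for nonzero x < p_sm2 the routine returns u with x * u == 2^512 (mod p_sm2), i.e. it maps a Montgomery-form input to the Montgomery-form inverse.

# Illustrative model of the bignum_montinv_sm2 contract described above
# (not the divstep implementation itself).
P_SM2 = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF

def montinv_sm2_model(x):
    assert 0 < x < P_SM2
    u = pow(x, -1, P_SM2) * pow(2, 512, P_SM2) % P_SM2
    assert x * u % P_SM2 == pow(2, 512, P_SM2)   # the stated relation after the 10th iteration
    return u

# In Montgomery terms: if x = 2^256 * a (mod p_sm2), then u = 2^256 * a^-1 (mod p_sm2).
a = 0x123456789ABCDEF
x = (a << 256) % P_SM2
assert montinv_sm2_model(x) == (pow(a, -1, P_SM2) << 256) % P_SM2
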
+ + stp xzr, xzr, [u] + stp xzr, xzr, [u+2*N] + +// The starting constant 2^562 mod p_sm2 is +// 0x0018000000040000:0x00040000000c0000:0x000bfffffff80000:0x000c000000100000 +// where colons separate 64-bit subwords, least significant at the right. +// These each need a couple of instructions to create on ARM + + mov x10, #0x0000000000100000 + orr x10, x10, #0x000c000000000000 + mov x11, #0x000c000000000000 + sub x11, x11, #0x80000 + stp x10, x11, [v] + mov x12, #0x0004000000000000 + orr x12, x12, #0x00000000000c0000 + mov x13, #0x0018000000000000 + orr x13, x13, #0x0000000000040000 + stp x12, x13, [v+2*N] + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special tenth iteration after a uniform +// first 9. + + mov i, #10 + mov d, #1 + b bignum_montinv_sm2_midloop + +bignum_montinv_sm2_loop: + +// Separate the matrix elements into sign-magnitude pairs + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in stable registers for the [u,v] part and do [f,g] first. + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + + and x0, m10, s10 + and x1, m11, s11 + add car1, x0, x1 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. +// +// Digit 0 of [f,g] + + ldr x7, [f] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [g] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x3, x3, x1 + +// Digit 1 of [f,g] + + ldr x7, [f+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [g+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [f] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [g] + +// Digit 2 of [f,g] + + ldr x7, [f+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [g+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [f+N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [g+N] + +// Digits 3 and 4 of [f,g] + + ldr x7, [f+3*N] + eor x1, x7, s00 + ldr x23, [f+4*N] + eor x3, x23, s00 + and x3, x3, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [g+3*N] + eor x1, x8, s01 + ldr x24, [g+4*N] + eor x0, x24, s01 + and x0, x0, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, 
x6, #59 + str x6, [f+2*N] + extr x5, x3, x5, #59 + str x5, [f+3*N] + asr x3, x3, #59 + str x3, [f+4*N] + + eor x1, x7, s10 + eor x5, x23, s10 + and x5, x5, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + eor x0, x24, s11 + and x0, x0, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [g+2*N] + extr x2, x5, x2, #59 + str x2, [g+3*N] + asr x5, x5, #59 + str x5, [g+4*N] + +// Now the computation of the updated u and v values and their +// Montgomery reductions. A very similar accumulation except that +// the top words of u and v are unsigned and we don't shift. +// +// Digit 0 of [u,v] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v] + adc x3, x3, x1 + +// Digit 1 of [u,v] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + str x3, [v+N] + adc x4, x4, x1 + +// Digit 2 of [u,v] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + str x4, [v+2*N] + adc x2, x2, x1 + +// Digits 3 and 4 of u (top is unsigned) + + ldr x7, [u+3*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + +// Montgomery reduction of u + + ldp x0, x1, [u] + ldr x6, [u+2*N] + amontred(x3,x5,x6,x1,x0, x24,x10,x11,x14) + stp x1, x6, [u] + stp x5, x3, [u+16] + +// Digits 3 and 4 of v (top is unsigned) + + eor x1, x7, s10 + and x5, s10, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + and x0, s11, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + +// Montgomery reduction of v + + ldp x0, x1, [v] + ldr x3, [v+2*N] + amontred(x5,x2,x3,x1,x0, x24,x10,x11,x14) + stp x1, x3, [v] + stp x2, x5, [v+16] + +bignum_montinv_sm2_midloop: + + mov x1, d + ldr x2, [f] + ldr x3, [g] + divstep59() + mov d, x1 + +// Next iteration + + subs i, i, #1 + bne bignum_montinv_sm2_loop + +// The 10th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. 
+// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + ldr x0, [f] + ldr x1, [g] + mul x0, x0, m00 + madd x1, x1, m01, x0 + asr x0, x1, #63 + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * 2^{-512} [u,v] (mod p_sm2) +// we want to flip the sign of u according to that of f. + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + eor s00, s00, x0 + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + eor s01, s01, x0 + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + eor s10, s10, x0 + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + eor s11, s11, x0 + +// Adjust the initial value to allow for complement instead of negation + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + +// Digit 0 of [u] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + +// Digit 1 of [u] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + +// Digit 2 of [u] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + +// Digits 3 and 4 of u (top is unsigned) + + ldr x7, [u+3*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + +// Montgomery reduction of u. This needs to be strict not "almost" +// so it is followed by an optional subtraction of p_sm2 + + ldp x0, x1, [u] + ldr x2, [u+2*N] + amontred(x3,x5,x2,x1,x0, x24,x10,x11,x14) + + mov x10, #0xffffffffffffffff + subs x10, x1, #-1 + mov x11, #0xffffffff00000000 + sbcs x11, x2, x11 + mov x13, #0xfffffffeffffffff + adcs x12, x5, xzr + sbcs x13, x3, x13 + + csel x10, x1, x10, cc + csel x11, x2, x11, cc + csel x12, x5, x12, cc + csel x13, x3, x13, cc + +// Store it back to the final output + + stp x10, x11, [res] + stp x12, x13, [res, #16] + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montmul_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montmul_sm2.S new file mode 100644 index 00000000000..f2595cb7ec5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montmul_sm2.S @@ -0,0 +1,267 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^256) mod p_sm2 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_montmul_sm2 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Does z := (2^{-256} * x * y) mod p_sm2, assuming that the inputs x and y +// satisfy x * y <= 2^256 * p_sm2 (in particular this is true if we are in +// the "usual" case x < p_sm2 and y < p_sm2). +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_sm2) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Macro returning (c,h,l) = 3-word 1s complement (x - y) * (w - z) +// c,h,l,t should all be different +// t,h should not overlap w,z +// --------------------------------------------------------------------------- + +#define muldiffn(c,h,l, t, x,y, w,z) \ + subs t, x, y __LF \ + cneg t, t, cc __LF \ + csetm c, cc __LF \ + subs h, w, z __LF \ + cneg h, h, cc __LF \ + mul l, t, h __LF \ + umulh h, t, h __LF \ + cinv c, c, cc __LF \ + eor l, l, c __LF \ + eor h, h, c + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1], and using t0, t1, t2 and t3 as +// temporaries. It is fine for d4 to be the same register as d0, +// and it often is. +// --------------------------------------------------------------------------- + +#define montreds(d4,d3,d2,d1,d0, t3,t2,t1,t0) \ +/* First let [t3;t2] = 2^32 * d0 and [t1;t0] = (2^32-1) * d0 */ \ + lsl t2, d0, #32 __LF \ + lsr t3, d0, #32 __LF \ + subs t0, t2, d0 __LF \ + sbc t1, t3, xzr __LF \ +/* Now [d4;d3;d2;d1] := [d0;d3;d2;d1] - [t3;t2;t1;t0] */ \ + subs d1, d1, t0 __LF \ + sbcs d2, d2, t1 __LF \ + sbcs d3, d3, t2 __LF \ + sbc d4, d0, t3 + +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define a3 x6 +#define b0 x7 +#define b1 x8 +#define b2 x9 +#define b3 x10 + +#define s0 x11 +#define s1 x12 +#define s2 x13 +#define s3 x14 +#define t0 x15 +#define t1 x16 +#define t2 x17 +#define t3 x1 +#define s4 x2 + +S2N_BN_SYMBOL(bignum_montmul_sm2): + +// Load in all words of both inputs + + ldp a0, a1, [x1] + ldp a2, a3, [x1, #16] + ldp b0, b1, [x2] + ldp b2, b3, [x2, #16] + +// Multiply low halves with a 2x2->4 ADK multiplier as L = [s3;s2;s1;s0] + + mul s0, a0, b0 + mul s2, a1, b1 + umulh s1, a0, b0 + adds t1, s0, s2 + umulh s3, a1, b1 + adcs t2, s1, s3 + adcs s3, s3, xzr + adds s1, s1, t1 + adcs s2, s2, t2 + adcs s3, s3, xzr + muldiffn(t3,t2,t1, t0, a0,a1, b1,b0) + adds xzr, t3, #1 + adcs s1, s1, t1 + adcs s2, s2, t2 + adc s3, s3, t3 + +// Perform two "short" Montgomery steps on the low product to +// get a modified low result L' = [s1;s0;s3;s2] +// This shifts it to an offset compatible with middle terms +// Stash the result L' temporarily in the output buffer to avoid +// using additional registers. 
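
As a side note on the arithmetic here (an illustrative Python sketch, not part of the patch; montreds_model is a made-up name, and pow with a negative exponent needs Python 3.8+): because p_sm2 == -1 (mod 2^64), the Montgomery multiplier in each "short" step is just the low limb itself, so one montreds step is the exact division of t + (t mod 2^64) * p_sm2 by 2^64, and the two steps here shift the low product L to L' == L * 2^-128 (mod p_sm2).

# One word-level Montgomery step mod p_sm2, as the montreds macro computes it.
P_SM2 = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF
M64 = (1 << 64) - 1

def montreds_model(t):
    t = t + (t & M64) * P_SM2    # multiplier is the low limb, since -p_sm2^-1 == 1 (mod 2^64)
    assert t & M64 == 0          # low limb is now zero ...
    return t >> 64               # ... so this shift is an exact division by 2^64

# Two steps give the offset used here: L' == L * 2^-128 (mod p_sm2).
L = 0x0123456789ABCDEF << 180
assert montreds_model(montreds_model(L)) % P_SM2 == L * pow(2, -128, P_SM2) % P_SM2
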
+ + montreds(s0,s3,s2,s1,s0, t0,t1,t2,t3) + montreds(s1,s0,s3,s2,s1, t0,t1,t2,t3) + + stp s2, s3, [x0] + stp s0, s1, [x0, #16] + +// Multiply high halves with a 2x2->4 ADK multiplier as H = [s3;s2;s1;s0] + + mul s0, a2, b2 + mul s2, a3, b3 + umulh s1, a2, b2 + adds t1, s0, s2 + umulh s3, a3, b3 + adcs t2, s1, s3 + adcs s3, s3, xzr + adds s1, s1, t1 + adcs s2, s2, t2 + adcs s3, s3, xzr + muldiffn(t3,t2,t1, t0, a2,a3, b3,b2) + adds xzr, t3, #1 + adcs s1, s1, t1 + adcs s2, s2, t2 + adc s3, s3, t3 + +// Compute sign-magnitude a2,[a1,a0] = x_hi - x_lo + + subs a0, a2, a0 + sbcs a1, a3, a1 + sbc a2, xzr, xzr + adds xzr, a2, #1 + eor a0, a0, a2 + adcs a0, a0, xzr + eor a1, a1, a2 + adcs a1, a1, xzr + +// Compute sign-magnitude b2,[b1,b0] = y_lo - y_hi + + subs b0, b0, b2 + sbcs b1, b1, b3 + sbc b2, xzr, xzr + adds xzr, b2, #1 + eor b0, b0, b2 + adcs b0, b0, xzr + eor b1, b1, b2 + adcs b1, b1, xzr + +// Save the correct sign for the sub-product in b3 + + eor b3, a2, b2 + +// Add the high H to the modified low term L' as H + L' = [s4;b2;a2;t3;t0] + + ldp t0, t3, [x0] + adds t0, s0, t0 + adcs t3, s1, t3 + ldp a2, b2, [x0, #16] + adcs a2, s2, a2 + adcs b2, s3, b2 + adc s4, xzr, xzr + +// Multiply with yet a third 2x2->4 ADK multiplier for complex mid-term M + + mul s0, a0, b0 + mul s2, a1, b1 + umulh s1, a0, b0 + adds t1, s0, s2 + umulh s3, a1, b1 + adcs t2, s1, s3 + adcs s3, s3, xzr + adds s1, s1, t1 + adcs s2, s2, t2 + adcs s3, s3, xzr + muldiffn(a1,t2,t1, a0, a0,a1, b1,b0) + adds xzr, a1, #1 + adcs s1, s1, t1 + adcs s2, s2, t2 + adc s3, s3, a1 + +// Set up a sign-modified version of the mid-product in a long accumulator +// as [b3;a1;a0;s3;s2;s1;s0], adding in the H + L' term once with +// zero offset as this signed value is created + + adds xzr, b3, #1 + eor s0, s0, b3 + adcs s0, s0, t0 + eor s1, s1, b3 + adcs s1, s1, t3 + eor s2, s2, b3 + adcs s2, s2, a2 + eor s3, s3, b3 + adcs s3, s3, b2 + adcs a0, s4, b3 + adcs a1, b3, xzr + adc b3, b3, xzr + +// Add in the stashed H + L' term an offset of 2 words as well + + adds s2, s2, t0 + adcs s3, s3, t3 + adcs a0, a0, a2 + adcs a1, a1, b2 + adc b3, b3, s4 + +// Do two more Montgomery steps on the composed term +// Net pre-reduct is in [b3;a1;a0;s3;s2] + + montreds(s0,s3,s2,s1,s0, t0,t1,t2,t3) + montreds(s1,s0,s3,s2,s1, t0,t1,t2,t3) + + adds a0, a0, s0 + adcs a1, a1, s1 + adc b3, b3, xzr + +// Because of the way we added L' in two places, we can overspill by +// more than usual in Montgomery, with the result being only known to +// be < 3 * p_sm2, not the usual < 2 * p_sm2. So now we do a more +// elaborate final correction in the style of bignum_cmul_sm2, though +// we can use much simpler quotient estimation logic (q = h + 1) and +// slightly more direct accumulation of p_sm2 * q. 
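
For intuition, the correction described in the comment above can be sketched in a few lines of Python (illustrative only; final_correct_model is a hypothetical name and the bound z < 3 * p_sm2 is the one quoted in the comment): with h = floor(z / 2^256), the estimate q = h + 1 gives -p_sm2 <= z - q * p_sm2 < p_sm2, so at most one masked add-back of p_sm2 is needed. The same q = h + 1 estimate reappears in bignum_triple_sm2 later in this patch.

# Sketch of the q = h + 1 final correction for a value known to be < 3 * p_sm2.
P_SM2 = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF

def final_correct_model(z):
    assert 0 <= z < 3 * P_SM2
    q = (z >> 256) + 1        # h + 1, where h is the top (overflow) word
    z -= q * P_SM2            # now -p_sm2 <= z < p_sm2
    if z < 0:                 # the assembly does this with a carry-derived mask
        z += P_SM2
    return z

for z in (0, P_SM2 - 1, P_SM2, 2 * P_SM2 + 5, 3 * P_SM2 - 1):
    assert final_correct_model(z) == z % P_SM2
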
+ +#define d0 s2 +#define d1 s3 +#define d2 a0 +#define d3 a1 +#define h b3 + +#define q s4 +#define c b0 + + add q, h, #1 + lsl t0, q, #32 + sub t1, t0, q + adds d0, d0, q + adcs d1, d1, t1 + adcs d2, d2, xzr + adcs d3, d3, t0 + csetm c, cc + adds d0, d0, c + and t1, c, #0xffffffff00000000 + adcs d1, d1, t1 + adcs d2, d2, c + and t0, c, #0xfffffffeffffffff + adc d3, d3, t0 + +// Finally store the result + + stp d0, d1, [x0] + stp d2, d3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montmul_sm2_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montmul_sm2_alt.S new file mode 100644 index 00000000000..a57f6c140d1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montmul_sm2_alt.S @@ -0,0 +1,204 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^256) mod p_sm2 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_montmul_sm2_alt +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Does z := (2^{-256} * x * y) mod p_sm2, assuming that the inputs x and y +// satisfy x * y <= 2^256 * p_sm2 (in particular this is true if we are in +// the "usual" case x < p_sm2 and y < p_sm2). +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_sm2_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_sm2_alt) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1], and using t0, t1, t2 and t3 as +// temporaries. It is fine for d4 to be the same register as d0, +// and it often is. 
+// --------------------------------------------------------------------------- + +#define montreds(d4,d3,d2,d1,d0, t3,t2,t1,t0) \ +/* First let [t3;t2] = 2^32 * d0 and [t1;t0] = (2^32-1) * d0 */ \ + lsl t2, d0, #32 __LF \ + lsr t3, d0, #32 __LF \ + subs t0, t2, d0 __LF \ + sbc t1, t3, xzr __LF \ +/* Now [d4;d3;d2;d1] := [d0;d3;d2;d1] - [t3;t2;t1;t0] */ \ + subs d1, d1, t0 __LF \ + sbcs d2, d2, t1 __LF \ + sbcs d3, d3, t2 __LF \ + sbc d4, d0, t3 + +#define z x0 +#define x x1 +#define y x2 + +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define a3 x6 +#define b0 x7 +#define b1 x8 +#define b2 x9 +#define b3 x10 + +#define l x11 + +#define u0 x12 +#define u1 x13 +#define u2 x14 +#define u3 x15 +#define u4 x16 + +// These alias to the input arguments when no longer needed + +#define u5 a0 +#define u6 a1 +#define u7 a2 +#define h a3 + +S2N_BN_SYMBOL(bignum_montmul_sm2_alt): + +// Load operands and set up row 0 = [u4;...;u0] = a0 * [b3;...;b0] + + ldp a0, a1, [x] + ldp b0, b1, [y] + + mul u0, a0, b0 + umulh u1, a0, b0 + mul l, a0, b1 + umulh u2, a0, b1 + adds u1, u1, l + + ldp b2, b3, [y, #16] + + mul l, a0, b2 + umulh u3, a0, b2 + adcs u2, u2, l + + mul l, a0, b3 + umulh u4, a0, b3 + adcs u3, u3, l + adc u4, u4, xzr + + ldp a2, a3, [x, #16] + +// Row 1 = [u5;...;u0] = [a1;a0] * [b3;...;b0] + + mul l, a1, b0 + adds u1, u1, l + mul l, a1, b1 + adcs u2, u2, l + mul l, a1, b2 + adcs u3, u3, l + mul l, a1, b3 + adcs u4, u4, l + umulh u5, a1, b3 + adc u5, u5, xzr + + umulh l, a1, b0 + adds u2, u2, l + umulh l, a1, b1 + adcs u3, u3, l + umulh l, a1, b2 + adcs u4, u4, l + adc u5, u5, xzr + +// Row 2 = [u6;...;u0] = [a2;a1;a0] * [b3;...;b0] + + mul l, a2, b0 + adds u2, u2, l + mul l, a2, b1 + adcs u3, u3, l + mul l, a2, b2 + adcs u4, u4, l + mul l, a2, b3 + adcs u5, u5, l + umulh u6, a2, b3 + adc u6, u6, xzr + + umulh l, a2, b0 + adds u3, u3, l + umulh l, a2, b1 + adcs u4, u4, l + umulh l, a2, b2 + adcs u5, u5, l + adc u6, u6, xzr + +// Row 3 = [u7;...;u0] = [a3;...a0] * [b3;...;b0] + + mul l, a3, b0 + adds u3, u3, l + mul l, a3, b1 + adcs u4, u4, l + mul l, a3, b2 + adcs u5, u5, l + mul l, a3, b3 + adcs u6, u6, l + umulh u7, a3, b3 + adc u7, u7, xzr + + umulh l, a3, b0 + adds u4, u4, l + umulh l, a3, b1 + adcs u5, u5, l + umulh l, a3, b2 + adcs u6, u6, l + adc u7, u7, xzr + +// Perform 4 Montgomery steps to rotate the lower half + + montreds(u0,u3,u2,u1,u0, h,l,b0,b1) + montreds(u1,u0,u3,u2,u1, h,l,b0,b1) + montreds(u2,u1,u0,u3,u2, h,l,b0,b1) + montreds(u3,u2,u1,u0,u3, h,l,b0,b1) + +// Add high and low parts, catching carry in b1 + + adds u0, u0, u4 + adcs u1, u1, u5 + adcs u2, u2, u6 + adcs u3, u3, u7 + cset b1, cs + +// Set [h;-1;l;-1] = p_sm2 and form [u7,u6,u5,u4] = [b1;u3;u2;u1;u0] - p_sm2 + + mov l, #0xffffffff00000000 + mov h, #0xfffffffeffffffff + + subs u4, u0, #-1 + sbcs u5, u1, l + adcs u6, u2, xzr + sbcs u7, u3, h + sbcs xzr, b1, xzr + +// Now CF is clear if the comparison carried so the original was fine +// Otherwise take the form with p_sm2 subtracted. 
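
Putting the pieces together, the overall flow of this routine can be modelled in Python as follows (an illustrative sketch of the mathematics rather than a limb-level transcription; montmul_sm2_alt_model is a made-up name, and pow with a negative exponent needs Python 3.8+): a full 512-bit product, four word-level Montgomery steps, then the single conditional subtraction selected by the carry as described above.

# Whole-function model: result == x * y * 2^-256 (mod p_sm2), assuming the
# stated precondition x * y <= 2^256 * p_sm2.
P_SM2 = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF
M64 = (1 << 64) - 1

def montmul_sm2_alt_model(x, y):
    t = x * y                              # 512-bit double-length product
    for _ in range(4):                     # one Montgomery step per 64-bit limb
        t = (t + (t & M64) * P_SM2) >> 64
    return t - P_SM2 if t >= P_SM2 else t  # t < 2 * p_sm2, so one subtraction suffices

x, y = 0x1234 << 200, 0xCAFEF00D
assert montmul_sm2_alt_model(x, y) == x * y * pow(2, -256, P_SM2) % P_SM2
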
+ + csel u0, u0, u4, cc + csel u1, u1, u5, cc + csel u2, u2, u6, cc + csel u3, u3, u7, cc + +// Store back final result + + stp u0, u1, [z] + stp u2, u3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montsqr_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montsqr_sm2.S new file mode 100644 index 00000000000..3c715a176a1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montsqr_sm2.S @@ -0,0 +1,268 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^256) mod p_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_montsqr_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Does z := (x^2 / 2^256) mod p_sm2, assuming x^2 <= 2^256 * p_sm2, which is +// guaranteed in particular if x < p_sm2 initially (the "intended" case). +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_sm2) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Macro returning (c,h,l) = 3-word 1s complement (x - y) * (w - z) +// c,h,l,t should all be different +// t,h should not overlap w,z +// --------------------------------------------------------------------------- + +#define muldiffn(c,h,l, t, x,y, w,z) \ + subs t, x, y __LF \ + cneg t, t, cc __LF \ + csetm c, cc __LF \ + subs h, w, z __LF \ + cneg h, h, cc __LF \ + mul l, t, h __LF \ + umulh h, t, h __LF \ + cinv c, c, cc __LF \ + eor l, l, c __LF \ + eor h, h, c + +// --------------------------------------------------------------------------- +// Core one-step "end" Montgomery reduction macro. Takes input in +// [d5;d4;d3;d2;d1;d0] and returns result in [d5;d4;d3;d2;d1], adding to +// the existing [d4;d3;d2;d1], re-using d0 as a temporary internally as well +// as t1, t2, t3, and initializing d5 from zero (hence "end"). +// --------------------------------------------------------------------------- + +#define montrede(d5, d4,d3,d2,d1,d0, t3,t2,t1,t0) \ +/* First let [t3;t2] = 2^32 * d0 and [t1;t0] = (2^32-1) * d0 */ \ + lsl t2, d0, #32 __LF \ + lsr t3, d0, #32 __LF \ + subs t0, t2, d0 __LF \ + sbc t1, t3, xzr __LF \ +/* Now [d4;d3;d2;d1] := [d0;d3;d2;d1] - [t3;t2;t1;t0] */ \ + subs d1, d1, t0 __LF \ + sbcs d2, d2, t1 __LF \ + sbcs d3, d3, t2 __LF \ + sbc t0, d0, t3 __LF \ + adds d4, d4, t0 __LF \ + adc d5, xzr, xzr + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1], and using t0, t1, t2 and t3 as +// temporaries. It is fine for d4 to be the same register as d0, +// and it often is. 
+// --------------------------------------------------------------------------- + +#define montreds(d4,d3,d2,d1,d0, t3,t2,t1,t0) \ +/* First let [t3;t2] = 2^32 * d0 and [t1;t0] = (2^32-1) * d0 */ \ + lsl t2, d0, #32 __LF \ + lsr t3, d0, #32 __LF \ + subs t0, t2, d0 __LF \ + sbc t1, t3, xzr __LF \ +/* Now [d4;d3;d2;d1] := [d0;d3;d2;d1] - [t3;t2;t1;t0] */ \ + subs d1, d1, t0 __LF \ + sbcs d2, d2, t1 __LF \ + sbcs d3, d3, t2 __LF \ + sbc d4, d0, t3 + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 + +#define c0 x6 +#define c1 x7 +#define c2 x8 +#define c3 x9 +#define c4 x10 +#define d1 x11 +#define d2 x12 +#define d3 x13 +#define d4 x14 + +#define s0 x15 +#define s1 x16 +#define s2 x17 +#define s3 x1 + +#define a0short w2 +#define a1short w3 +#define d1short w11 + +S2N_BN_SYMBOL(bignum_montsqr_sm2): + +// Load in all words of the input + + ldp a0, a1, [x1] + ldp a2, a3, [x1, #16] + +// Square the low half, getting a result in [s3;s2;s1;s0] +// This uses 32x32->64 multiplications to reduce the number of UMULHs + + umull s0, a0short, a0short + lsr d1, a0, #32 + umull s1, d1short, d1short + umull d1, a0short, d1short + adds s0, s0, d1, lsl #33 + lsr d1, d1, #31 + adc s1, s1, d1 + umull s2, a1short, a1short + lsr d1, a1, #32 + umull s3, d1short, d1short + umull d1, a1short, d1short + mul d2, a0, a1 + umulh d3, a0, a1 + adds s2, s2, d1, lsl #33 + lsr d1, d1, #31 + adc s3, s3, d1 + adds d2, d2, d2 + adcs d3, d3, d3 + adc s3, s3, xzr + adds s1, s1, d2 + adcs s2, s2, d3 + adc s3, s3, xzr + +// Perform two "short" Montgomery steps on the low square +// This shifts it to an offset compatible with middle product + + montreds(s0,s3,s2,s1,s0, d1,d2,d3,d4) + + montreds(s1,s0,s3,s2,s1, d1,d2,d3,d4) + +// Compute cross-product with ADK 2x2->4 multiplier as [c3;c2;c1;c0] + + mul c0, a0, a2 + mul d4, a1, a3 + umulh c2, a0, a2 + muldiffn(d3,d2,d1, c4, a0,a1, a3,a2) + + adds c1, c0, c2 + adc c2, c2, xzr + + umulh c3, a1, a3 + + adds c1, c1, d4 + adcs c2, c2, c3 + adc c3, c3, xzr + adds c2, c2, d4 + adc c3, c3, xzr + + adds xzr, d3, #1 + adcs c1, c1, d1 + adcs c2, c2, d2 + adc c3, c3, d3 + +// Double it and add the Montgomerified low square + + adds c0, c0, c0 + adcs c1, c1, c1 + adcs c2, c2, c2 + adcs c3, c3, c3 + adc c4, xzr, xzr + + adds c0, c0, s2 + adcs c1, c1, s3 + adcs c2, c2, s0 + adcs c3, c3, s1 + adc c4, c4, xzr + +// Montgomery-reduce the combined low and middle term another twice + + montrede(c0,c4,c3,c2,c1,c0, d1,d2,d3,d4) + + montrede(c1,c0,c4,c3,c2,c1, d1,d2,d3,d4) + +// Our sum so far is in [c1,c0,c4,c3,c2]; choose more intuitive names + +#define r0 x8 +#define r1 x9 +#define r2 x10 +#define r3 x6 +#define c x7 + +// Remind ourselves what else we can't destroy + +#define a2 x4 +#define a3 x5 + +// So we can have these as temps + +#define t1 x11 +#define t2 x12 +#define t3 x13 + +// Add in the pure squares 22 + 33 + + mul t1, a2, a2 + adds r0, r0, t1 + mul t2, a3, a3 + umulh t1, a2, a2 + adcs r1, r1, t1 + adcs r2, r2, t2 + umulh t2, a3, a3 + adcs r3, r3, t2 + adc c, c, xzr + +// Construct the 23 term, double and add it in + + mul t1, a2, a3 + umulh t2, a2, a3 + adds t1, t1, t1 + adcs t2, t2, t2 + adc t3, xzr, xzr + + adds r1, r1, t1 + adcs r2, r2, t2 + adcs r3, r3, t3 + adcs c, c, xzr + +// We know, writing B = 2^{4*64} that the full implicit result is +// B^2 c <= z + (B - 1) * p < B * p + (B - 1) * p < 2 * B * p, +// so the top half is certainly < 2 * p. If c = 1 already, we know +// subtracting p will give the reduced modulus. 
But now we do a +// subtraction-comparison to catch cases where the residue is >= p. +// The constants are such that [t3;0;t1;-1] = p_256. + +#define t0 x5 + +// Set CF (because of inversion) iff (0,p_256) <= (c,r3,r2,r1,r0) + + mov t1, #0xffffffff00000000 + subs t0, r0, #-1 + sbcs t1, r1, t1 + mov t3, #0xfffffffeffffffff + adcs t2, r2, xzr + sbcs t3, r3, t3 + sbcs xzr, c, xzr + +// Select final output accordingly + + csel r0, t0, r0, cs + csel r1, t1, r1, cs + csel r2, t2, r2, cs + csel r3, t3, r3, cs + +// Store things back in place + + stp r0, r1, [x0] + stp r2, r3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montsqr_sm2_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montsqr_sm2_alt.S new file mode 100644 index 00000000000..f2e871b3857 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montsqr_sm2_alt.S @@ -0,0 +1,178 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^256) mod p_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_montsqr_sm2_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Does z := (x^2 / 2^256) mod p_sm2, assuming x^2 <= 2^256 * p_sm2, which is +// guaranteed in particular if x < p_sm2 initially (the "intended" case). +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_sm2_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_sm2_alt) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1], and using t0, t1, t2 and t3 as +// temporaries. It is fine for d4 to be the same register as d0, +// and it often is. +// --------------------------------------------------------------------------- + +#define montreds(d4,d3,d2,d1,d0, t3,t2,t1,t0) \ +/* First let [t3;t2] = 2^32 * d0 and [t1;t0] = (2^32-1) * d0 */ \ + lsl t2, d0, #32 __LF \ + lsr t3, d0, #32 __LF \ + subs t0, t2, d0 __LF \ + sbc t1, t3, xzr __LF \ +/* Now [d4;d3;d2;d1] := [d0;d3;d2;d1] - [t3;t2;t1;t0] */ \ + subs d1, d1, t0 __LF \ + sbcs d2, d2, t1 __LF \ + sbcs d3, d3, t2 __LF \ + sbc d4, d0, t3 + +#define z x0 +#define x x1 + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 + +#define l x6 +#define h x7 + +#define u0 x8 +#define u1 x9 +#define u2 x10 +#define u3 x11 +#define u4 x12 +#define u5 x13 +#define u6 x14 + +// This one is the same as h, which is safe with this computation sequence + +#define u7 h + +S2N_BN_SYMBOL(bignum_montsqr_sm2_alt): + +// Load all the elements, set up an initial window [u6;...u1] = [23;03;01] +// and chain in the addition of 02 + 12 + 13 (no carry-out is possible). 
+// This gives all the "heterogeneous" terms of the squaring ready to double + + ldp a0, a1, [x] + + mul u1, a0, a1 + umulh u2, a0, a1 + + ldp a2, a3, [x, #16] + + mul u3, a0, a3 + umulh u4, a0, a3 + + mul l, a0, a2 + umulh h, a0, a2 + adds u2, u2, l + + adcs u3, u3, h + mul l, a1, a2 + umulh h, a1, a2 + adc h, h, xzr + adds u3, u3, l + + mul u5, a2, a3 + umulh u6, a2, a3 + + adcs u4, u4, h + mul l, a1, a3 + umulh h, a1, a3 + adc h, h, xzr + adds u4, u4, l + + adcs u5, u5, h + adc u6, u6, xzr + +// Now just double it; this simple approach seems to work better than extr + + adds u1, u1, u1 + adcs u2, u2, u2 + adcs u3, u3, u3 + adcs u4, u4, u4 + adcs u5, u5, u5 + adcs u6, u6, u6 + cset u7, cs + +// Add the homogeneous terms 00 + 11 + 22 + 33 + + umulh l, a0, a0 + mul u0, a0, a0 + adds u1, u1, l + + mul l, a1, a1 + adcs u2, u2, l + umulh l, a1, a1 + adcs u3, u3, l + + mul l, a2, a2 + adcs u4, u4, l + umulh l, a2, a2 + adcs u5, u5, l + + mul l, a3, a3 + adcs u6, u6, l + umulh l, a3, a3 + adc u7, u7, l + +// Squaring complete. Perform 4 Montgomery steps to rotate the lower half + + montreds(u0,u3,u2,u1,u0, a3,a2,a1,a0) + montreds(u1,u0,u3,u2,u1, a3,a2,a1,a0) + montreds(u2,u1,u0,u3,u2, a3,a2,a1,a0) + montreds(u3,u2,u1,u0,u3, a3,a2,a1,a0) + +// Add high and low parts, catching carry in a0 + + adds u0, u0, u4 + adcs u1, u1, u5 + adcs u2, u2, u6 + adcs u3, u3, u7 + cset a0, cs + +// Set [a3;-1;a1;-1] = p_sm2 and form [u7,u6,u5,u4] = [a0;u3;u2;u1;u0] - p_sm2 + + mov a1, #0xffffffff00000000 + mov a3, #0xfffffffeffffffff + + subs u4, u0, #-1 + sbcs u5, u1, a1 + adcs u6, u2, xzr + sbcs u7, u3, a3 + sbcs xzr, a0, xzr + +// Now CF is clear if the comparison carried so the original was fine +// Otherwise take the form with p_sm2 subtracted. + + csel u0, u0, u4, cc + csel u1, u1, u5, cc + csel u2, u2, u6, cc + csel u3, u3, u7, cc + +// Store back final result + + stp u0, u1, [z] + stp u2, u3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_neg_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_neg_sm2.S new file mode 100644 index 00000000000..e91f73ddfce --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_neg_sm2.S @@ -0,0 +1,66 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Negate modulo p_sm2, z := (-x) mod p_sm2, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_neg_sm2 (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_neg_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_neg_sm2) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define p x2 +#define t x3 + +#define d0 x4 +#define d1 x5 +#define d2 x6 +#define d3 x7 + +S2N_BN_SYMBOL(bignum_neg_sm2): + +// Load the 4 digits of x + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Set a bitmask p for the input being nonzero, so that we avoid doing +// -0 = p_sm2 and hence maintain strict modular reduction + + orr t, d0, d1 + orr p, d2, d3 + orr p, p, t + cmp p, #0 + csetm p, ne + +// Mask nontrivial words of p_sm2 = [n3;-1;n1;-1] and subtract + + subs d0, p, d0 + and t, p, #0xffffffff00000000 + sbcs d1, t, d1 + sbcs d2, p, d2 + and t, p, #0xfffffffeffffffff + sbc d3, t, d3 + +// Write back the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + +// Return + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_optneg_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_optneg_sm2.S new file mode 100644 index 00000000000..5b977fda1f9 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_optneg_sm2.S @@ -0,0 +1,83 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally negate modulo p_sm2, z := (-x) mod p_sm2 (if p nonzero) or +// z := x (if p zero), assuming x reduced +// Inputs p, x[4]; output z[4] +// +// extern void bignum_optneg_sm2 +// (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = p, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optneg_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optneg_sm2) + .text + .balign 4 + +#define z x0 +#define p x1 +#define x x2 + +#define d0 x3 +#define d1 x4 +#define d2 x5 +#define d3 x6 +#define n0 x7 +#define n1 x8 +#define n2 x9 +#define n3 x10 + +S2N_BN_SYMBOL(bignum_optneg_sm2): + +// Load the 4 digits of x + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Adjust p by zeroing it if the input is zero (to avoid giving -0 = p, which +// is not strictly reduced even though it's correct modulo p) + + orr n0, d0, d1 + orr n1, d2, d3 + orr n2, n0, n1 + cmp n2, #0 + csel p, xzr, p, eq + +// Load the nontrivial words of p_sm2 = [n3;-1;n1;-1] + + mov n2, #0xffffffffffffffff + mov n1, #0xffffffff00000000 + mov n3, #0xfffffffeffffffff + +// Do the subtraction, which by hypothesis does not underflow + + subs n0, n2, d0 + sbcs n1, n1, d1 + sbcs n2, n2, d2 + sbc n3, n3, d3 + +// Set condition code if original x is nonzero and p was nonzero + + cmp p, #0 + +// Hence multiplex and write back + + csel n0, n0, d0, ne + csel n1, n1, d1, ne + csel n2, n2, d2, ne + csel n3, n3, d3, ne + + stp n0, n1, [z] + stp n2, n3, [z, #16] + +// Return + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_sub_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_sub_sm2.S new file mode 100644 index 00000000000..38467a4fd05 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_sub_sm2.S @@ -0,0 +1,67 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Subtract modulo p_sm2, z := (x - y) mod p_sm2 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_sub_sm2 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub_sm2) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 +#define c x3 +#define l x4 +#define d0 x5 +#define d1 x6 +#define d2 x7 +#define d3 x8 + + +S2N_BN_SYMBOL(bignum_sub_sm2): + +// First just subtract the numbers as [d3; d2; d1; d0] +// Set a mask based on (inverted) carry indicating x < y = correction is needed + + ldp d0, d1, [x] + ldp l, c, [y] + subs d0, d0, l + sbcs d1, d1, c + ldp d2, d3, [x, #16] + ldp l, c, [y, #16] + sbcs d2, d2, l + sbcs d3, d3, c + +// Create a mask for the condition x < y, when we need to correct + + csetm c, cc + +// Now correct by adding masked p_sm2 + + adds d0, d0, c + and l, c, #0xffffffff00000000 + adcs d1, d1, l + adcs d2, d2, c + and l, c, #0xfffffffeffffffff + adc d3, d3, l + +// Store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_tomont_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_tomont_sm2.S new file mode 100644 index 00000000000..d5bfb407e68 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_tomont_sm2.S @@ -0,0 +1,108 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert to Montgomery form z := (2^256 * x) mod p_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_tomont_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tomont_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tomont_sm2) + .text + .balign 4 + +// ---------------------------------------------------------------------------- +// Core "x |-> (2^64 * x) mod p_sm2" macro, with x assumed to be < p_sm2. +// We write it as a macro to be repeated instead of using .rep in assembler. +// The code here is very similar to the core of bignum_mod_sm2, just +// implicitly inserting zeros instead of fresh digits. 
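
A short Python sketch of what this file computes overall (illustrative only; tomont_sm2_model is a made-up name): Montgomery conversion is just four applications of the x |-> (2^64 * x) mod p_sm2 step that the macro below implements with a quotient approximation.

# Model of bignum_tomont_sm2: reduce below p_sm2 (the code uses a single
# conditional subtraction, which suffices for a 4-word input), then four
# word-sized shifts modulo p_sm2 give (2^256 * x) mod p_sm2.
P_SM2 = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF

def tomont_sm2_model(x):
    x %= P_SM2                     # initial reduction below p_sm2
    for _ in range(4):
        x = (x << 64) % P_SM2      # one modstep_sm2() per iteration
    return x

x = 0x1122334455667788990011223344556677
assert tomont_sm2_model(x) == (x << 256) % P_SM2
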
+// ---------------------------------------------------------------------------- + +#define modstep_sm2() \ +/* Writing the input, with a lowest zero digit appended, as */ \ +/* z = 2^256 * d3 + 2^192 * d2 + t, quotient approximation is */ \ +/* MIN ((d3 * (1 + 2^32 + 2^64) + d2 + 2^64) >> 64) (2^64 - 1) */ \ + adds t3, d2, d3 __LF \ + mov t2, #1 __LF \ + adc t1, d3, t2 __LF \ + add t2, d3, t3, lsr #32 __LF \ + adds q, t1, t2, lsr #32 __LF \ + cinv q, q, cs __LF \ +/* Let t3 = q<<32 and t4 = q>>32 then [t2;t1] = 2^32 * q - q */ \ + lsl t3, q, #32 __LF \ + subs t1, t3, q __LF \ + lsr t4, q, #32 __LF \ + sbc t2, t4, xzr __LF \ +/* Do the basic correction [t4;t3;t2;t1;q] = 2^256 * x - q * p */ \ + adds t1, t1, d0 __LF \ + sub d3, d3, q __LF \ + adcs t2, t2, d1 __LF \ + adcs t3, t3, d2 __LF \ + adc t4, t4, d3 __LF \ +/* Use top word as mask to correct */ \ + adds d0, q, t4 __LF \ + and t0, t4, #0xffffffff00000000 __LF \ + adcs d1, t1, t0 __LF \ + adcs d2, t2, t4 __LF \ + and t0, t4, #0xfffffffeffffffff __LF \ + adc d3, t3, t0 + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 + +#define t1 x6 +#define t2 x7 +#define t3 x8 + +#define t4 x9 + +#define q x1 +#define t0 x1 + +S2N_BN_SYMBOL(bignum_tomont_sm2): + +// Load the input + + ldp d0, d1, [x1] + ldp d2, d3, [x1, #16] + +// Do an initial reduction to make sure this is < p_sm2, using just +// a copy of the bignum_mod_sm2_4 code. This is needed to set up the +// invariant "input < p_sm2" for the main modular reduction steps. + + subs t0, d0, #-1 + mov t1, #0xffffffff00000000 + sbcs t1, d1, t1 + adcs t2, d2, xzr + mov t3, #0xfffffffeffffffff + sbcs t3, d3, t3 + csel d0, d0, t0, cc + csel d1, d1, t1, cc + csel d2, d2, t2, cc + csel d3, d3, t3, cc + +// Now do 4 iterations of a basic x |-> (2^64 * x) mod p_sm2 step. + + modstep_sm2() + modstep_sm2() + modstep_sm2() + modstep_sm2() + +// Store the result and return + + stp d0, d1, [x0] + stp d2, d3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_triple_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_triple_sm2.S new file mode 100644 index 00000000000..3811fc9ef99 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_triple_sm2.S @@ -0,0 +1,107 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Triple modulo p_sm2, z := (3 * x) mod p_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_triple_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// The input x can be any 4-digit bignum, not necessarily reduced modulo p_sm2, +// and the result is always fully reduced, i.e. z = (3 * x) mod p_sm2. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_sm2) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_sm2_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_sm2_alt) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 +#define h x6 + +// Slightly offset aliases for the d_i for readability. 
+ +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define a3 x6 + +// More aliases for the same thing at different stages + +#define q x6 +#define c x6 + +// Other temporary variables + +#define t0 x7 +#define t1 x8 + +S2N_BN_SYMBOL(bignum_triple_sm2): + +S2N_BN_SYMBOL(bignum_triple_sm2_alt): + +// Load the inputs + + ldp a0, a1, [x] + ldp a2, a3, [x, #16] + +// First do the multiplication by 3, getting z = [h; d3; ...; d0] + + lsl d0, a0, #1 + adds d0, d0, a0 + extr d1, a1, a0, #63 + adcs d1, d1, a1 + extr d2, a2, a1, #63 + adcs d2, d2, a2 + extr d3, a3, a2, #63 + adcs d3, d3, a3 + lsr h, a3, #63 + adc h, h, xzr + +// For this limited range a simple quotient estimate of q = h + 1 works, where +// h = floor(z / 2^256). Then -p_sm2 <= z - q * p_sm2 < p_sm2, so we just need +// to subtract q * p_sm2 and then if that's negative, add back p_sm2. + + add q, h, #1 + +// Initial subtraction of z - q * p_sm2, with bitmask c for the carry + + lsl t0, q, #32 + sub t1, t0, q + adds d0, d0, q + adcs d1, d1, t1 + adcs d2, d2, xzr + adcs d3, d3, t0 + csetm c, cc + +// Use the bitmask c for final masked addition of p_sm2. + + adds d0, d0, c + and t1, c, #0xffffffff00000000 + adcs d1, d1, t1 + adcs d2, d2, c + and t0, c, #0xfffffffeffffffff + adc d3, d3, t0 + +// Finally store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjadd.S new file mode 100644 index 00000000000..ac916cc547b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjadd.S @@ -0,0 +1,540 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on GM/T 0003-2012 curve SM2 in Montgomery-Jacobian coordinates +// +// extern void sm2_montjadd +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_sm2. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). 
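+//
+// Note added for this import (not upstream commentary): as usual for Jacobian
+// coordinates, the point at infinity is represented by any triple with z = 0.
+// The code below computes the generic sum and then, in the final multiplexing
+// step, returns p2 unchanged when p1 has z = 0 and p1 unchanged when p2 has
+// z = 0, so those inputs need no separate code path.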
+// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjadd) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x17 +#define input_x x19 +#define input_y x20 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE +#define z_2 input_y, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define x1a sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define z2sq sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define y1a sp, #(NUMSIZE*6) + +#define NSPACE (NUMSIZE*7) + +// Corresponds to bignum_montmul_sm2 with x0 in place of x17 + +#define montmul_sm2(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2] __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x7 __LF \ + mul x13, x4, x8 __LF \ + umulh x12, x3, x7 __LF \ + adds x16, x11, x13 __LF \ + umulh x14, x4, x8 __LF \ + adcs x0, x12, x14 __LF \ + adcs x14, x14, xzr __LF \ + adds x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adcs x14, x14, xzr __LF \ + subs x15, x3, x4 __LF \ + cneg x15, x15, lo __LF \ + csetm x1, lo __LF \ + subs x0, x8, x7 __LF \ + cneg x0, x0, lo __LF \ + mul x16, x15, x0 __LF \ + umulh x0, x15, x0 __LF \ + cinv x1, x1, lo __LF \ + eor x16, x16, x1 __LF \ + eor x0, x0, x1 __LF \ + cmn x1, #1 __LF \ + adcs x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adc x14, x14, x1 __LF \ + lsl x16, x11, #32 __LF \ + lsr x15, x11, #32 __LF \ + subs x1, x16, x11 __LF \ + sbc x0, x15, xzr __LF \ + subs x12, x12, x1 __LF \ + sbcs x13, x13, x0 __LF \ + sbcs x14, x14, x16 __LF \ + sbc x11, x11, x15 __LF \ + lsl x16, x12, #32 __LF \ + lsr x15, x12, #32 __LF \ + subs x1, x16, x12 __LF \ + sbc x0, x15, xzr __LF \ + subs x13, x13, x1 __LF \ + sbcs x14, x14, x0 __LF \ + sbcs x11, x11, x16 __LF \ + sbc x12, x12, x15 __LF \ + stp x13, x14, [P0] __LF \ + stp x11, x12, [P0+16] __LF \ + mul x11, x5, x9 __LF \ + mul x13, x6, x10 __LF \ + umulh x12, x5, x9 __LF \ + adds x16, x11, x13 __LF \ + umulh x14, x6, x10 __LF \ + adcs x0, x12, x14 __LF \ + adcs x14, x14, xzr __LF \ + adds x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adcs x14, x14, xzr __LF \ + subs x15, x5, x6 __LF \ + cneg x15, x15, lo __LF \ + csetm x1, lo __LF \ + subs x0, x10, x9 __LF \ + cneg x0, x0, lo __LF \ + mul x16, x15, x0 __LF \ + umulh x0, x15, x0 __LF \ + cinv x1, x1, lo __LF \ + eor x16, x16, x1 __LF \ + eor x0, x0, x1 __LF \ + cmn x1, #1 __LF \ + adcs x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adc x14, x14, x1 __LF \ + subs x3, x5, x3 __LF \ + sbcs x4, x6, x4 __LF \ + ngc x5, xzr __LF \ + cmn x5, #1 __LF \ + eor x3, x3, x5 __LF \ + 
adcs x3, x3, xzr __LF \ + eor x4, x4, x5 __LF \ + adcs x4, x4, xzr __LF \ + subs x7, x7, x9 __LF \ + sbcs x8, x8, x10 __LF \ + ngc x9, xzr __LF \ + cmn x9, #1 __LF \ + eor x7, x7, x9 __LF \ + adcs x7, x7, xzr __LF \ + eor x8, x8, x9 __LF \ + adcs x8, x8, xzr __LF \ + eor x10, x5, x9 __LF \ + ldp x15, x1, [P0] __LF \ + adds x15, x11, x15 __LF \ + adcs x1, x12, x1 __LF \ + ldp x5, x9, [P0+16] __LF \ + adcs x5, x13, x5 __LF \ + adcs x9, x14, x9 __LF \ + adc x2, xzr, xzr __LF \ + mul x11, x3, x7 __LF \ + mul x13, x4, x8 __LF \ + umulh x12, x3, x7 __LF \ + adds x16, x11, x13 __LF \ + umulh x14, x4, x8 __LF \ + adcs x0, x12, x14 __LF \ + adcs x14, x14, xzr __LF \ + adds x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adcs x14, x14, xzr __LF \ + subs x3, x3, x4 __LF \ + cneg x3, x3, lo __LF \ + csetm x4, lo __LF \ + subs x0, x8, x7 __LF \ + cneg x0, x0, lo __LF \ + mul x16, x3, x0 __LF \ + umulh x0, x3, x0 __LF \ + cinv x4, x4, lo __LF \ + eor x16, x16, x4 __LF \ + eor x0, x0, x4 __LF \ + cmn x4, #1 __LF \ + adcs x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adc x14, x14, x4 __LF \ + cmn x10, #1 __LF \ + eor x11, x11, x10 __LF \ + adcs x11, x11, x15 __LF \ + eor x12, x12, x10 __LF \ + adcs x12, x12, x1 __LF \ + eor x13, x13, x10 __LF \ + adcs x13, x13, x5 __LF \ + eor x14, x14, x10 __LF \ + adcs x14, x14, x9 __LF \ + adcs x3, x2, x10 __LF \ + adcs x4, x10, xzr __LF \ + adc x10, x10, xzr __LF \ + adds x13, x13, x15 __LF \ + adcs x14, x14, x1 __LF \ + adcs x3, x3, x5 __LF \ + adcs x4, x4, x9 __LF \ + adc x10, x10, x2 __LF \ + lsl x16, x11, #32 __LF \ + lsr x15, x11, #32 __LF \ + subs x1, x16, x11 __LF \ + sbc x0, x15, xzr __LF \ + subs x12, x12, x1 __LF \ + sbcs x13, x13, x0 __LF \ + sbcs x14, x14, x16 __LF \ + sbc x11, x11, x15 __LF \ + lsl x16, x12, #32 __LF \ + lsr x15, x12, #32 __LF \ + subs x1, x16, x12 __LF \ + sbc x0, x15, xzr __LF \ + subs x13, x13, x1 __LF \ + sbcs x14, x14, x0 __LF \ + sbcs x11, x11, x16 __LF \ + sbc x12, x12, x15 __LF \ + adds x3, x3, x11 __LF \ + adcs x4, x4, x12 __LF \ + adc x10, x10, xzr __LF \ + add x2, x10, #1 __LF \ + lsl x15, x2, #32 __LF \ + sub x16, x15, x2 __LF \ + adds x13, x13, x2 __LF \ + adcs x14, x14, x16 __LF \ + adcs x3, x3, xzr __LF \ + adcs x4, x4, x15 __LF \ + csetm x7, lo __LF \ + adds x13, x13, x7 __LF \ + and x16, x7, #0xffffffff00000000 __LF \ + adcs x14, x14, x16 __LF \ + adcs x3, x3, x7 __LF \ + and x15, x7, #0xfffffffeffffffff __LF \ + adc x4, x4, x15 __LF \ + stp x13, x14, [P0] __LF \ + stp x3, x4, [P0+16] + +// Corresponds to bignum_montsqr_sm2 with x0 in place of x17 + +#define montsqr_sm2(P0,P1) \ + ldp x2, x3, [P1] __LF \ + ldp x4, x5, [P1+16] __LF \ + umull x15, w2, w2 __LF \ + lsr x11, x2, #32 __LF \ + umull x16, w11, w11 __LF \ + umull x11, w2, w11 __LF \ + adds x15, x15, x11, lsl #33 __LF \ + lsr x11, x11, #31 __LF \ + adc x16, x16, x11 __LF \ + umull x0, w3, w3 __LF \ + lsr x11, x3, #32 __LF \ + umull x1, w11, w11 __LF \ + umull x11, w3, w11 __LF \ + mul x12, x2, x3 __LF \ + umulh x13, x2, x3 __LF \ + adds x0, x0, x11, lsl #33 __LF \ + lsr x11, x11, #31 __LF \ + adc x1, x1, x11 __LF \ + adds x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adc x1, x1, xzr __LF \ + adds x16, x16, x12 __LF \ + adcs x0, x0, x13 __LF \ + adc x1, x1, xzr __LF \ + lsl x12, x15, #32 __LF \ + lsr x11, x15, #32 __LF \ + subs x14, x12, x15 __LF \ + sbc x13, x11, xzr __LF \ + subs x16, x16, x14 __LF \ + sbcs x0, x0, x13 __LF \ + sbcs x1, x1, x12 __LF \ + sbc x15, x15, x11 __LF \ + lsl x12, x16, #32 __LF \ + lsr x11, x16, #32 __LF \ + subs x14, x12, x16 
__LF \ + sbc x13, x11, xzr __LF \ + subs x0, x0, x14 __LF \ + sbcs x1, x1, x13 __LF \ + sbcs x15, x15, x12 __LF \ + sbc x16, x16, x11 __LF \ + mul x6, x2, x4 __LF \ + mul x14, x3, x5 __LF \ + umulh x8, x2, x4 __LF \ + subs x10, x2, x3 __LF \ + cneg x10, x10, lo __LF \ + csetm x13, lo __LF \ + subs x12, x5, x4 __LF \ + cneg x12, x12, lo __LF \ + mul x11, x10, x12 __LF \ + umulh x12, x10, x12 __LF \ + cinv x13, x13, lo __LF \ + eor x11, x11, x13 __LF \ + eor x12, x12, x13 __LF \ + adds x7, x6, x8 __LF \ + adc x8, x8, xzr __LF \ + umulh x9, x3, x5 __LF \ + adds x7, x7, x14 __LF \ + adcs x8, x8, x9 __LF \ + adc x9, x9, xzr __LF \ + adds x8, x8, x14 __LF \ + adc x9, x9, xzr __LF \ + cmn x13, #1 __LF \ + adcs x7, x7, x11 __LF \ + adcs x8, x8, x12 __LF \ + adc x9, x9, x13 __LF \ + adds x6, x6, x6 __LF \ + adcs x7, x7, x7 __LF \ + adcs x8, x8, x8 __LF \ + adcs x9, x9, x9 __LF \ + adc x10, xzr, xzr __LF \ + adds x6, x6, x0 __LF \ + adcs x7, x7, x1 __LF \ + adcs x8, x8, x15 __LF \ + adcs x9, x9, x16 __LF \ + adc x10, x10, xzr __LF \ + lsl x12, x6, #32 __LF \ + lsr x11, x6, #32 __LF \ + subs x14, x12, x6 __LF \ + sbc x13, x11, xzr __LF \ + subs x7, x7, x14 __LF \ + sbcs x8, x8, x13 __LF \ + sbcs x9, x9, x12 __LF \ + sbc x14, x6, x11 __LF \ + adds x10, x10, x14 __LF \ + adc x6, xzr, xzr __LF \ + lsl x12, x7, #32 __LF \ + lsr x11, x7, #32 __LF \ + subs x14, x12, x7 __LF \ + sbc x13, x11, xzr __LF \ + subs x8, x8, x14 __LF \ + sbcs x9, x9, x13 __LF \ + sbcs x10, x10, x12 __LF \ + sbc x14, x7, x11 __LF \ + adds x6, x6, x14 __LF \ + adc x7, xzr, xzr __LF \ + mul x11, x4, x4 __LF \ + adds x8, x8, x11 __LF \ + mul x12, x5, x5 __LF \ + umulh x11, x4, x4 __LF \ + adcs x9, x9, x11 __LF \ + adcs x10, x10, x12 __LF \ + umulh x12, x5, x5 __LF \ + adcs x6, x6, x12 __LF \ + adc x7, x7, xzr __LF \ + mul x11, x4, x5 __LF \ + umulh x12, x4, x5 __LF \ + adds x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adc x13, xzr, xzr __LF \ + adds x9, x9, x11 __LF \ + adcs x10, x10, x12 __LF \ + adcs x6, x6, x13 __LF \ + adcs x7, x7, xzr __LF \ + mov x11, #-4294967296 __LF \ + adds x5, x8, #1 __LF \ + sbcs x11, x9, x11 __LF \ + mov x13, #-4294967297 __LF \ + adcs x12, x10, xzr __LF \ + sbcs x13, x6, x13 __LF \ + sbcs xzr, x7, xzr __LF \ + csel x8, x5, x8, hs __LF \ + csel x9, x11, x9, hs __LF \ + csel x10, x12, x10, hs __LF \ + csel x6, x13, x6, hs __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x6, [P0+16] + +// Corresponds exactly to bignum_sub_sm2 + +#define sub_sm2(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + csetm x3, cc __LF \ + adds x5, x5, x3 __LF \ + and x4, x3, #0xffffffff00000000 __LF \ + adcs x6, x6, x4 __LF \ + adcs x7, x7, x3 __LF \ + and x4, x3, #0xfffffffeffffffff __LF \ + adc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +S2N_BN_SYMBOL(sm2_montjadd): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! 
+ sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations +// 12 * multiply + 4 * square + 7 * subtract + + montsqr_sm2(z1sq,z_1) + montsqr_sm2(z2sq,z_2) + + montmul_sm2(y1a,z_2,y_1) + montmul_sm2(y2a,z_1,y_2) + + montmul_sm2(x2a,z1sq,x_2) + montmul_sm2(x1a,z2sq,x_1) + montmul_sm2(y2a,z1sq,y2a) + montmul_sm2(y1a,z2sq,y1a) + + sub_sm2(xd,x2a,x1a) + sub_sm2(yd,y2a,y1a) + + montsqr_sm2(zz,xd) + montsqr_sm2(ww,yd) + + montmul_sm2(zzx1,zz,x1a) + montmul_sm2(zzx2,zz,x2a) + + sub_sm2(resx,ww,zzx1) + sub_sm2(t1,zzx2,zzx1) + + montmul_sm2(xd,xd,z_1) + + sub_sm2(resx,resx,zzx2) + + sub_sm2(t2,zzx1,resx) + + montmul_sm2(t1,t1,y1a) + montmul_sm2(resz,xd,z_2) + montmul_sm2(t2,yd,t2) + + sub_sm2(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) +// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + + orr x12, x0, x1 + orr x13, x2, x3 + orr x12, x12, x13 + cmp x12, xzr + cset x12, ne + + ldp x4, x5, [z_2] + ldp x6, x7, [z_2+16] + + orr x13, x4, x5 + orr x14, x6, x7 + orr x13, x13, x14 + cmp x13, xzr + cset x13, ne + + cmp x13, x12 + +// Multiplex the outputs accordingly, re-using the z's in registers + + ldp x8, x9, [resz] + csel x8, x0, x8, lo + csel x9, x1, x9, lo + csel x8, x4, x8, hi + csel x9, x5, x9, hi + ldp x10, x11, [resz+16] + csel x10, x2, x10, lo + csel x11, x3, x11, lo + csel x10, x6, x10, hi + csel x11, x7, x11, hi + + ldp x12, x13, [x_1] + ldp x0, x1, [resx] + csel x0, x12, x0, lo + csel x1, x13, x1, lo + ldp x12, x13, [x_2] + csel x0, x12, x0, hi + csel x1, x13, x1, hi + + ldp x12, x13, [x_1+16] + ldp x2, x3, [resx+16] + csel x2, x12, x2, lo + csel x3, x13, x3, lo + ldp x12, x13, [x_2+16] + csel x2, x12, x2, hi + csel x3, x13, x3, hi + + ldp x12, x13, [y_1] + ldp x4, x5, [resy] + csel x4, x12, x4, lo + csel x5, x13, x5, lo + ldp x12, x13, [y_2] + csel x4, x12, x4, hi + csel x5, x13, x5, hi + + ldp x12, x13, [y_1+16] + ldp x6, x7, [resy+16] + csel x6, x12, x6, lo + csel x7, x13, x7, lo + ldp x12, x13, [y_2+16] + csel x6, x12, x6, hi + csel x7, x13, x7, hi + +// Finally store back the multiplexed values + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [y_3] + stp x6, x7, [y_3+16] + stp x8, x9, [z_3] + stp x10, x11, [z_3+16] + +// Restore registers and return + + add sp, sp, NSPACE + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjadd_alt.S new file mode 100644 index 00000000000..390c203ffe9 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjadd_alt.S @@ -0,0 +1,548 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on GM/T 0003-2012 curve SM2 in Montgomery-Jacobian coordinates +// +// extern void sm2_montjadd_alt +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. 
x' = (2^256 * x) mod p_sm2. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjadd_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x15 +#define input_x x16 +#define input_y x17 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE +#define z_2 input_y, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define x1a sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define z2sq sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define y1a sp, #(NUMSIZE*6) + +#define NSPACE (NUMSIZE*7) + +// Corresponds to bignum_montmul_sm2_alt except for registers + +#define montmul_sm2(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x0, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x1, x3, x10 __LF \ + adcs x0, x0, x11 __LF \ + adc x1, x1, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x1, x1, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x0, x0, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x1, x1, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x0, x0, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x1, x1, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x0, x0, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x1, x1, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + lsl x11, x12, #32 __LF \ + lsr x6, x12, #32 __LF \ + subs x8, x11, x12 __LF \ + sbc x7, x6, xzr __LF \ + 
subs x13, x13, x8 __LF \ + sbcs x14, x14, x7 __LF \ + sbcs x0, x0, x11 __LF \ + sbc x12, x12, x6 __LF \ + lsl x11, x13, #32 __LF \ + lsr x6, x13, #32 __LF \ + subs x8, x11, x13 __LF \ + sbc x7, x6, xzr __LF \ + subs x14, x14, x8 __LF \ + sbcs x0, x0, x7 __LF \ + sbcs x12, x12, x11 __LF \ + sbc x13, x13, x6 __LF \ + lsl x11, x14, #32 __LF \ + lsr x6, x14, #32 __LF \ + subs x8, x11, x14 __LF \ + sbc x7, x6, xzr __LF \ + subs x0, x0, x8 __LF \ + sbcs x12, x12, x7 __LF \ + sbcs x13, x13, x11 __LF \ + sbc x14, x14, x6 __LF \ + lsl x11, x0, #32 __LF \ + lsr x6, x0, #32 __LF \ + subs x8, x11, x0 __LF \ + sbc x7, x6, xzr __LF \ + subs x12, x12, x8 __LF \ + sbcs x13, x13, x7 __LF \ + sbcs x14, x14, x11 __LF \ + sbc x0, x0, x6 __LF \ + adds x12, x12, x1 __LF \ + adcs x13, x13, x3 __LF \ + adcs x14, x14, x4 __LF \ + adcs x0, x0, x5 __LF \ + cset x8, cs __LF \ + mov x11, #0xffffffff00000000 __LF \ + mov x6, #0xfffffffeffffffff __LF \ + adds x1, x12, #0x1 __LF \ + sbcs x3, x13, x11 __LF \ + adcs x4, x14, xzr __LF \ + sbcs x5, x0, x6 __LF \ + sbcs xzr, x8, xzr __LF \ + csel x12, x12, x1, cc __LF \ + csel x13, x13, x3, cc __LF \ + csel x14, x14, x4, cc __LF \ + csel x0, x0, x5, cc __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x0, [P0+16] + +// Corresponds to bignum_montsqr_sm2_alt exactly + +#define montsqr_sm2(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x6, x2, x4 __LF \ + umulh x7, x2, x4 __LF \ + adds x10, x10, x6 __LF \ + adcs x11, x11, x7 __LF \ + mul x6, x3, x4 __LF \ + umulh x7, x3, x4 __LF \ + adc x7, x7, xzr __LF \ + adds x11, x11, x6 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x7 __LF \ + mul x6, x3, x5 __LF \ + umulh x7, x3, x5 __LF \ + adc x7, x7, xzr __LF \ + adds x12, x12, x6 __LF \ + adcs x13, x13, x7 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x7, cs __LF \ + umulh x6, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x6 __LF \ + mul x6, x3, x3 __LF \ + adcs x10, x10, x6 __LF \ + umulh x6, x3, x3 __LF \ + adcs x11, x11, x6 __LF \ + mul x6, x4, x4 __LF \ + adcs x12, x12, x6 __LF \ + umulh x6, x4, x4 __LF \ + adcs x13, x13, x6 __LF \ + mul x6, x5, x5 __LF \ + adcs x14, x14, x6 __LF \ + umulh x6, x5, x5 __LF \ + adc x7, x7, x6 __LF \ + lsl x4, x8, #32 __LF \ + lsr x5, x8, #32 __LF \ + subs x2, x4, x8 __LF \ + sbc x3, x5, xzr __LF \ + subs x9, x9, x2 __LF \ + sbcs x10, x10, x3 __LF \ + sbcs x11, x11, x4 __LF \ + sbc x8, x8, x5 __LF \ + lsl x4, x9, #32 __LF \ + lsr x5, x9, #32 __LF \ + subs x2, x4, x9 __LF \ + sbc x3, x5, xzr __LF \ + subs x10, x10, x2 __LF \ + sbcs x11, x11, x3 __LF \ + sbcs x8, x8, x4 __LF \ + sbc x9, x9, x5 __LF \ + lsl x4, x10, #32 __LF \ + lsr x5, x10, #32 __LF \ + subs x2, x4, x10 __LF \ + sbc x3, x5, xzr __LF \ + subs x11, x11, x2 __LF \ + sbcs x8, x8, x3 __LF \ + sbcs x9, x9, x4 __LF \ + sbc x10, x10, x5 __LF \ + lsl x4, x11, #32 __LF \ + lsr x5, x11, #32 __LF \ + subs x2, x4, x11 __LF \ + sbc x3, x5, xzr __LF \ + subs x8, x8, x2 __LF \ + sbcs x9, x9, x3 __LF \ + sbcs x10, x10, x4 __LF \ + sbc x11, x11, x5 __LF \ + adds x8, x8, x12 __LF \ + adcs x9, x9, x13 __LF \ + adcs x10, x10, x14 __LF \ + adcs x11, x11, x7 __LF \ + cset x2, cs __LF \ + mov x3, #0xffffffff00000000 __LF \ + mov x5, #0xfffffffeffffffff __LF \ + adds x12, x8, #0x1 __LF \ + sbcs 
x13, x9, x3 __LF \ + adcs x14, x10, xzr __LF \ + sbcs x7, x11, x5 __LF \ + sbcs xzr, x2, xzr __LF \ + csel x8, x8, x12, cc __LF \ + csel x9, x9, x13, cc __LF \ + csel x10, x10, x14, cc __LF \ + csel x11, x11, x7, cc __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Almost-Montgomery variant which we use when an input to other muls +// with the other argument fully reduced (which is always safe). + +#define amontsqr_sm2(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x6, x2, x4 __LF \ + umulh x7, x2, x4 __LF \ + adds x10, x10, x6 __LF \ + adcs x11, x11, x7 __LF \ + mul x6, x3, x4 __LF \ + umulh x7, x3, x4 __LF \ + adc x7, x7, xzr __LF \ + adds x11, x11, x6 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x7 __LF \ + mul x6, x3, x5 __LF \ + umulh x7, x3, x5 __LF \ + adc x7, x7, xzr __LF \ + adds x12, x12, x6 __LF \ + adcs x13, x13, x7 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x7, cs __LF \ + umulh x6, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x6 __LF \ + mul x6, x3, x3 __LF \ + adcs x10, x10, x6 __LF \ + umulh x6, x3, x3 __LF \ + adcs x11, x11, x6 __LF \ + mul x6, x4, x4 __LF \ + adcs x12, x12, x6 __LF \ + umulh x6, x4, x4 __LF \ + adcs x13, x13, x6 __LF \ + mul x6, x5, x5 __LF \ + adcs x14, x14, x6 __LF \ + umulh x6, x5, x5 __LF \ + adc x7, x7, x6 __LF \ + lsl x4, x8, #32 __LF \ + lsr x5, x8, #32 __LF \ + subs x2, x4, x8 __LF \ + sbc x3, x5, xzr __LF \ + subs x9, x9, x2 __LF \ + sbcs x10, x10, x3 __LF \ + sbcs x11, x11, x4 __LF \ + sbc x8, x8, x5 __LF \ + lsl x4, x9, #32 __LF \ + lsr x5, x9, #32 __LF \ + subs x2, x4, x9 __LF \ + sbc x3, x5, xzr __LF \ + subs x10, x10, x2 __LF \ + sbcs x11, x11, x3 __LF \ + sbcs x8, x8, x4 __LF \ + sbc x9, x9, x5 __LF \ + lsl x4, x10, #32 __LF \ + lsr x5, x10, #32 __LF \ + subs x2, x4, x10 __LF \ + sbc x3, x5, xzr __LF \ + subs x11, x11, x2 __LF \ + sbcs x8, x8, x3 __LF \ + sbcs x9, x9, x4 __LF \ + sbc x10, x10, x5 __LF \ + lsl x4, x11, #32 __LF \ + lsr x5, x11, #32 __LF \ + subs x2, x4, x11 __LF \ + sbc x3, x5, xzr __LF \ + subs x8, x8, x2 __LF \ + sbcs x9, x9, x3 __LF \ + sbcs x10, x10, x4 __LF \ + sbc x11, x11, x5 __LF \ + adds x8, x8, x12 __LF \ + adcs x9, x9, x13 __LF \ + adcs x10, x10, x14 __LF \ + adcs x11, x11, x7 __LF \ + csetm x2, cs __LF \ + subs x8, x8, x2 __LF \ + and x3, x2, #0xffffffff00000000 __LF \ + sbcs x9, x9, x3 __LF \ + and x5, x2, #0xfffffffeffffffff __LF \ + sbcs x10, x10, x2 __LF \ + sbc x11, x11, x5 __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Corresponds exactly to bignum_sub_sm2 + +#define sub_sm2(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + csetm x3, cc __LF \ + adds x5, x5, x3 __LF \ + and x4, x3, #0xffffffff00000000 __LF \ + adcs x6, x6, x4 __LF \ + adcs x7, x7, x3 __LF \ + and x4, x3, #0xfffffffeffffffff __LF \ + adc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +S2N_BN_SYMBOL(sm2_montjadd_alt): + +// Make room on stack for temporary variables +// Move the input arguments to stable places + + sub sp, sp, NSPACE + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main 
code, just a sequence of basic field operations +// 12 * multiply + 4 * square + 7 * subtract + + amontsqr_sm2(z1sq,z_1) + amontsqr_sm2(z2sq,z_2) + + montmul_sm2(y1a,z_2,y_1) + montmul_sm2(y2a,z_1,y_2) + + montmul_sm2(x2a,z1sq,x_2) + montmul_sm2(x1a,z2sq,x_1) + montmul_sm2(y2a,z1sq,y2a) + montmul_sm2(y1a,z2sq,y1a) + + sub_sm2(xd,x2a,x1a) + sub_sm2(yd,y2a,y1a) + + amontsqr_sm2(zz,xd) + montsqr_sm2(ww,yd) + + montmul_sm2(zzx1,zz,x1a) + montmul_sm2(zzx2,zz,x2a) + + sub_sm2(resx,ww,zzx1) + sub_sm2(t1,zzx2,zzx1) + + montmul_sm2(xd,xd,z_1) + + sub_sm2(resx,resx,zzx2) + + sub_sm2(t2,zzx1,resx) + + montmul_sm2(t1,t1,y1a) + montmul_sm2(resz,xd,z_2) + montmul_sm2(t2,yd,t2) + + sub_sm2(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) +// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + + orr x12, x0, x1 + orr x13, x2, x3 + orr x12, x12, x13 + cmp x12, xzr + cset x12, ne + + ldp x4, x5, [z_2] + ldp x6, x7, [z_2+16] + + orr x13, x4, x5 + orr x14, x6, x7 + orr x13, x13, x14 + cmp x13, xzr + cset x13, ne + + cmp x13, x12 + +// Multiplex the outputs accordingly, re-using the z's in registers + + ldp x8, x9, [resz] + csel x8, x0, x8, lo + csel x9, x1, x9, lo + csel x8, x4, x8, hi + csel x9, x5, x9, hi + ldp x10, x11, [resz+16] + csel x10, x2, x10, lo + csel x11, x3, x11, lo + csel x10, x6, x10, hi + csel x11, x7, x11, hi + + ldp x12, x13, [x_1] + ldp x0, x1, [resx] + csel x0, x12, x0, lo + csel x1, x13, x1, lo + ldp x12, x13, [x_2] + csel x0, x12, x0, hi + csel x1, x13, x1, hi + + ldp x12, x13, [x_1+16] + ldp x2, x3, [resx+16] + csel x2, x12, x2, lo + csel x3, x13, x3, lo + ldp x12, x13, [x_2+16] + csel x2, x12, x2, hi + csel x3, x13, x3, hi + + ldp x12, x13, [y_1] + ldp x4, x5, [resy] + csel x4, x12, x4, lo + csel x5, x13, x5, lo + ldp x12, x13, [y_2] + csel x4, x12, x4, hi + csel x5, x13, x5, hi + + ldp x12, x13, [y_1+16] + ldp x6, x7, [resy+16] + csel x6, x12, x6, lo + csel x7, x13, x7, lo + ldp x12, x13, [y_2+16] + csel x6, x12, x6, hi + csel x7, x13, x7, hi + +// Finally store back the multiplexed values + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [y_3] + stp x6, x7, [y_3+16] + stp x8, x9, [z_3] + stp x10, x11, [z_3+16] + +// Restore stack and return + + add sp, sp, NSPACE + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjdouble.S new file mode 100644 index 00000000000..e878d939cc1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjdouble.S @@ -0,0 +1,663 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on GM/T 0003-2012 curve SM2 in Montgomery-Jacobian coordinates +// +// extern void sm2_montjdouble +// (uint64_t p3[static 12],uint64_t p1[static 12]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_sm2. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). 
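+//
+// Note added for this import (not upstream commentary): SM2 uses a = -3, and
+// the sequence below is the standard Jacobian doubling for that case. With
+//
+//   m = 3 * (x + z^2) * (x - z^2) = 3 * x2p
+//   s = 4 * x * y^2 = 4 * xy2
+//
+// the code forms d = 12 * xy2 - 9 * x2p^2 and then outputs
+//
+//   x' = 4 * xy2 - d = m^2 - 2 * s
+//   y' = 3 * d * x2p - 8 * y^4 = m * (s - x') - 8 * y^4
+//   z' = (y + z)^2 - y^2 - z^2 = 2 * y * z
+//
+// with every operation performed in the Montgomery domain modulo p_sm2.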
+// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjdouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjdouble) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x19 +#define input_x x20 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z2 sp, #(NUMSIZE*0) +#define y4 sp, #(NUMSIZE*0) + +#define y2 sp, #(NUMSIZE*1) + +#define t1 sp, #(NUMSIZE*2) + +#define t2 sp, #(NUMSIZE*3) +#define x2p sp, #(NUMSIZE*3) +#define dx2 sp, #(NUMSIZE*3) + +#define xy2 sp, #(NUMSIZE*4) + +#define x4p sp, #(NUMSIZE*5) +#define d_ sp, #(NUMSIZE*5) + +#define NSPACE #(NUMSIZE*6) + +// Corresponds to bignum_montmul_sm2 exactly + +#define montmul_sm2(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2] __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x7 __LF \ + mul x13, x4, x8 __LF \ + umulh x12, x3, x7 __LF \ + adds x16, x11, x13 __LF \ + umulh x14, x4, x8 __LF \ + adcs x17, x12, x14 __LF \ + adcs x14, x14, xzr __LF \ + adds x12, x12, x16 __LF \ + adcs x13, x13, x17 __LF \ + adcs x14, x14, xzr __LF \ + subs x15, x3, x4 __LF \ + cneg x15, x15, lo __LF \ + csetm x1, lo __LF \ + subs x17, x8, x7 __LF \ + cneg x17, x17, lo __LF \ + mul x16, x15, x17 __LF \ + umulh x17, x15, x17 __LF \ + cinv x1, x1, lo __LF \ + eor x16, x16, x1 __LF \ + eor x17, x17, x1 __LF \ + cmn x1, #1 __LF \ + adcs x12, x12, x16 __LF \ + adcs x13, x13, x17 __LF \ + adc x14, x14, x1 __LF \ + lsl x16, x11, #32 __LF \ + lsr x15, x11, #32 __LF \ + subs x1, x16, x11 __LF \ + sbc x17, x15, xzr __LF \ + subs x12, x12, x1 __LF \ + sbcs x13, x13, x17 __LF \ + sbcs x14, x14, x16 __LF \ + sbc x11, x11, x15 __LF \ + lsl x16, x12, #32 __LF \ + lsr x15, x12, #32 __LF \ + subs x1, x16, x12 __LF \ + sbc x17, x15, xzr __LF \ + subs x13, x13, x1 __LF \ + sbcs x14, x14, x17 __LF \ + sbcs x11, x11, x16 __LF \ + sbc x12, x12, x15 __LF \ + stp x13, x14, [P0] __LF \ + stp x11, x12, [P0+16] __LF \ + mul x11, x5, x9 __LF \ + mul x13, x6, x10 __LF \ + umulh x12, x5, x9 __LF \ + adds x16, x11, x13 __LF \ + umulh x14, x6, x10 __LF \ + adcs x17, x12, x14 __LF \ + adcs x14, x14, xzr __LF \ + adds x12, x12, x16 __LF \ + adcs x13, x13, x17 __LF \ + adcs x14, x14, xzr __LF \ + subs x15, x5, x6 __LF \ + cneg x15, x15, lo __LF \ + csetm x1, lo __LF \ + subs x17, x10, x9 __LF \ + cneg x17, x17, lo __LF \ + mul x16, x15, x17 __LF \ + umulh x17, x15, x17 __LF \ + cinv x1, x1, lo __LF \ + eor x16, x16, x1 __LF \ + eor x17, x17, x1 __LF \ + cmn x1, #1 __LF \ + adcs x12, x12, x16 __LF \ + adcs x13, x13, x17 __LF \ + adc x14, x14, x1 __LF \ + subs x3, x5, x3 __LF \ + sbcs x4, x6, x4 __LF \ + ngc x5, xzr __LF \ + cmn x5, #1 __LF \ + eor x3, x3, x5 __LF \ + adcs x3, x3, xzr __LF \ + eor x4, x4, x5 __LF \ + adcs x4, x4, xzr __LF \ + subs x7, x7, x9 __LF \ + sbcs x8, x8, x10 __LF \ + ngc x9, xzr __LF \ + cmn x9, #1 __LF \ + eor x7, x7, x9 __LF \ + adcs x7, x7, xzr __LF \ + eor x8, x8, x9 __LF \ + adcs x8, x8, xzr __LF \ + eor x10, x5, x9 __LF \ + ldp x15, x1, [P0] __LF \ + adds x15, 
x11, x15 __LF \ + adcs x1, x12, x1 __LF \ + ldp x5, x9, [P0+16] __LF \ + adcs x5, x13, x5 __LF \ + adcs x9, x14, x9 __LF \ + adc x2, xzr, xzr __LF \ + mul x11, x3, x7 __LF \ + mul x13, x4, x8 __LF \ + umulh x12, x3, x7 __LF \ + adds x16, x11, x13 __LF \ + umulh x14, x4, x8 __LF \ + adcs x17, x12, x14 __LF \ + adcs x14, x14, xzr __LF \ + adds x12, x12, x16 __LF \ + adcs x13, x13, x17 __LF \ + adcs x14, x14, xzr __LF \ + subs x3, x3, x4 __LF \ + cneg x3, x3, lo __LF \ + csetm x4, lo __LF \ + subs x17, x8, x7 __LF \ + cneg x17, x17, lo __LF \ + mul x16, x3, x17 __LF \ + umulh x17, x3, x17 __LF \ + cinv x4, x4, lo __LF \ + eor x16, x16, x4 __LF \ + eor x17, x17, x4 __LF \ + cmn x4, #1 __LF \ + adcs x12, x12, x16 __LF \ + adcs x13, x13, x17 __LF \ + adc x14, x14, x4 __LF \ + cmn x10, #1 __LF \ + eor x11, x11, x10 __LF \ + adcs x11, x11, x15 __LF \ + eor x12, x12, x10 __LF \ + adcs x12, x12, x1 __LF \ + eor x13, x13, x10 __LF \ + adcs x13, x13, x5 __LF \ + eor x14, x14, x10 __LF \ + adcs x14, x14, x9 __LF \ + adcs x3, x2, x10 __LF \ + adcs x4, x10, xzr __LF \ + adc x10, x10, xzr __LF \ + adds x13, x13, x15 __LF \ + adcs x14, x14, x1 __LF \ + adcs x3, x3, x5 __LF \ + adcs x4, x4, x9 __LF \ + adc x10, x10, x2 __LF \ + lsl x16, x11, #32 __LF \ + lsr x15, x11, #32 __LF \ + subs x1, x16, x11 __LF \ + sbc x17, x15, xzr __LF \ + subs x12, x12, x1 __LF \ + sbcs x13, x13, x17 __LF \ + sbcs x14, x14, x16 __LF \ + sbc x11, x11, x15 __LF \ + lsl x16, x12, #32 __LF \ + lsr x15, x12, #32 __LF \ + subs x1, x16, x12 __LF \ + sbc x17, x15, xzr __LF \ + subs x13, x13, x1 __LF \ + sbcs x14, x14, x17 __LF \ + sbcs x11, x11, x16 __LF \ + sbc x12, x12, x15 __LF \ + adds x3, x3, x11 __LF \ + adcs x4, x4, x12 __LF \ + adc x10, x10, xzr __LF \ + add x2, x10, #1 __LF \ + lsl x15, x2, #32 __LF \ + sub x16, x15, x2 __LF \ + adds x13, x13, x2 __LF \ + adcs x14, x14, x16 __LF \ + adcs x3, x3, xzr __LF \ + adcs x4, x4, x15 __LF \ + csetm x7, lo __LF \ + adds x13, x13, x7 __LF \ + and x16, x7, #0xffffffff00000000 __LF \ + adcs x14, x14, x16 __LF \ + adcs x3, x3, x7 __LF \ + and x15, x7, #0xfffffffeffffffff __LF \ + adc x4, x4, x15 __LF \ + stp x13, x14, [P0] __LF \ + stp x3, x4, [P0+16] + +// Corresponds to bignum_montsqr_sm2 exactly + +#define montsqr_sm2(P0,P1) \ + ldp x2, x3, [P1] __LF \ + ldp x4, x5, [P1+16] __LF \ + umull x15, w2, w2 __LF \ + lsr x11, x2, #32 __LF \ + umull x16, w11, w11 __LF \ + umull x11, w2, w11 __LF \ + adds x15, x15, x11, lsl #33 __LF \ + lsr x11, x11, #31 __LF \ + adc x16, x16, x11 __LF \ + umull x17, w3, w3 __LF \ + lsr x11, x3, #32 __LF \ + umull x1, w11, w11 __LF \ + umull x11, w3, w11 __LF \ + mul x12, x2, x3 __LF \ + umulh x13, x2, x3 __LF \ + adds x17, x17, x11, lsl #33 __LF \ + lsr x11, x11, #31 __LF \ + adc x1, x1, x11 __LF \ + adds x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adc x1, x1, xzr __LF \ + adds x16, x16, x12 __LF \ + adcs x17, x17, x13 __LF \ + adc x1, x1, xzr __LF \ + lsl x12, x15, #32 __LF \ + lsr x11, x15, #32 __LF \ + subs x14, x12, x15 __LF \ + sbc x13, x11, xzr __LF \ + subs x16, x16, x14 __LF \ + sbcs x17, x17, x13 __LF \ + sbcs x1, x1, x12 __LF \ + sbc x15, x15, x11 __LF \ + lsl x12, x16, #32 __LF \ + lsr x11, x16, #32 __LF \ + subs x14, x12, x16 __LF \ + sbc x13, x11, xzr __LF \ + subs x17, x17, x14 __LF \ + sbcs x1, x1, x13 __LF \ + sbcs x15, x15, x12 __LF \ + sbc x16, x16, x11 __LF \ + mul x6, x2, x4 __LF \ + mul x14, x3, x5 __LF \ + umulh x8, x2, x4 __LF \ + subs x10, x2, x3 __LF \ + cneg x10, x10, lo __LF \ + csetm x13, lo __LF \ + subs x12, x5, x4 __LF \ + 
cneg x12, x12, lo __LF \ + mul x11, x10, x12 __LF \ + umulh x12, x10, x12 __LF \ + cinv x13, x13, lo __LF \ + eor x11, x11, x13 __LF \ + eor x12, x12, x13 __LF \ + adds x7, x6, x8 __LF \ + adc x8, x8, xzr __LF \ + umulh x9, x3, x5 __LF \ + adds x7, x7, x14 __LF \ + adcs x8, x8, x9 __LF \ + adc x9, x9, xzr __LF \ + adds x8, x8, x14 __LF \ + adc x9, x9, xzr __LF \ + cmn x13, #1 __LF \ + adcs x7, x7, x11 __LF \ + adcs x8, x8, x12 __LF \ + adc x9, x9, x13 __LF \ + adds x6, x6, x6 __LF \ + adcs x7, x7, x7 __LF \ + adcs x8, x8, x8 __LF \ + adcs x9, x9, x9 __LF \ + adc x10, xzr, xzr __LF \ + adds x6, x6, x17 __LF \ + adcs x7, x7, x1 __LF \ + adcs x8, x8, x15 __LF \ + adcs x9, x9, x16 __LF \ + adc x10, x10, xzr __LF \ + lsl x12, x6, #32 __LF \ + lsr x11, x6, #32 __LF \ + subs x14, x12, x6 __LF \ + sbc x13, x11, xzr __LF \ + subs x7, x7, x14 __LF \ + sbcs x8, x8, x13 __LF \ + sbcs x9, x9, x12 __LF \ + sbc x14, x6, x11 __LF \ + adds x10, x10, x14 __LF \ + adc x6, xzr, xzr __LF \ + lsl x12, x7, #32 __LF \ + lsr x11, x7, #32 __LF \ + subs x14, x12, x7 __LF \ + sbc x13, x11, xzr __LF \ + subs x8, x8, x14 __LF \ + sbcs x9, x9, x13 __LF \ + sbcs x10, x10, x12 __LF \ + sbc x14, x7, x11 __LF \ + adds x6, x6, x14 __LF \ + adc x7, xzr, xzr __LF \ + mul x11, x4, x4 __LF \ + adds x8, x8, x11 __LF \ + mul x12, x5, x5 __LF \ + umulh x11, x4, x4 __LF \ + adcs x9, x9, x11 __LF \ + adcs x10, x10, x12 __LF \ + umulh x12, x5, x5 __LF \ + adcs x6, x6, x12 __LF \ + adc x7, x7, xzr __LF \ + mul x11, x4, x5 __LF \ + umulh x12, x4, x5 __LF \ + adds x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adc x13, xzr, xzr __LF \ + adds x9, x9, x11 __LF \ + adcs x10, x10, x12 __LF \ + adcs x6, x6, x13 __LF \ + adcs x7, x7, xzr __LF \ + mov x11, #-4294967296 __LF \ + adds x5, x8, #1 __LF \ + sbcs x11, x9, x11 __LF \ + mov x13, #-4294967297 __LF \ + adcs x12, x10, xzr __LF \ + sbcs x13, x6, x13 __LF \ + sbcs xzr, x7, xzr __LF \ + csel x8, x5, x8, hs __LF \ + csel x9, x11, x9, hs __LF \ + csel x10, x12, x10, hs __LF \ + csel x6, x13, x6, hs __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x6, [P0+16] + +// Corresponds exactly to bignum_sub_sm2 + +#define sub_sm2(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + csetm x3, cc __LF \ + adds x5, x5, x3 __LF \ + and x4, x3, #0xffffffff00000000 __LF \ + adcs x6, x6, x4 __LF \ + adcs x7, x7, x3 __LF \ + and x4, x3, #0xfffffffeffffffff __LF \ + adc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Corresponds exactly to bignum_add_sm2 + +#define add_sm2(P0,P1,P2) \ + ldp x4, x5, [P1] __LF \ + ldp x8, x9, [P2] __LF \ + adds x4, x4, x8 __LF \ + adcs x5, x5, x9 __LF \ + ldp x6, x7, [P1+16] __LF \ + ldp x10, x11, [P2+16] __LF \ + adcs x6, x6, x10 __LF \ + adcs x7, x7, x11 __LF \ + adc x3, xzr, xzr __LF \ + adds x8, x4, #0x1 __LF \ + mov x9, #0xffffffff00000000 __LF \ + sbcs x9, x5, x9 __LF \ + adcs x10, x6, xzr __LF \ + mov x11, #0xfffffffeffffffff __LF \ + sbcs x11, x7, x11 __LF \ + sbcs x3, x3, xzr __LF \ + csel x4, x4, x8, cc __LF \ + csel x5, x5, x9, cc __LF \ + csel x6, x6, x10, cc __LF \ + csel x7, x7, x11, cc __LF \ + stp x4, x5, [P0] __LF \ + stp x6, x7, [P0+16] + +// A weak version of add that only guarantees sum in 4 digits + +#define weakadd_sm2(P0,P1,P2) \ + ldp x4, x5, [P1] __LF \ + ldp x8, x9, [P2] __LF \ + adds x4, x4, x8 __LF \ + adcs x5, x5, x9 __LF \ + ldp x6, x7, [P1+16] __LF \ + ldp 
x10, x11, [P2+16] __LF \ + adcs x6, x6, x10 __LF \ + adcs x7, x7, x11 __LF \ + csetm x2, cs __LF \ + subs x4, x4, x2 __LF \ + and x3, x2, #0xffffffff00000000 __LF \ + sbcs x5, x5, x3 __LF \ + and x1, x2, #0xfffffffeffffffff __LF \ + sbcs x6, x6, x2 __LF \ + sbc x7, x7, x1 __LF \ + stp x4, x5, [P0] __LF \ + stp x6, x7, [P0+16] + +// P0 = C * P1 - D * P2 computed as D * (p_sm2 - P2) + C * P1 +// Quotient estimation is done just as q = h + 1 as in bignum_triple_sm2 +// This also applies to the other functions following. + +#define cmsub_sm2(P0,C,P1,D,P2) \ + mov x1, D __LF \ + mov x2, #-1 __LF \ + ldp x9, x10, [P2] __LF \ + subs x9, x2, x9 __LF \ + mov x3, #0xffffffff00000000 __LF \ + sbcs x10, x3, x10 __LF \ + ldp x11, x12, [P2+16] __LF \ + sbcs x11, x2, x11 __LF \ + mov x4, #0xfffffffeffffffff __LF \ + sbc x12, x4, x12 __LF \ + mul x3, x1, x9 __LF \ + mul x4, x1, x10 __LF \ + mul x5, x1, x11 __LF \ + mul x6, x1, x12 __LF \ + umulh x9, x1, x9 __LF \ + umulh x10, x1, x10 __LF \ + umulh x11, x1, x11 __LF \ + umulh x7, x1, x12 __LF \ + adds x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + adcs x6, x6, x11 __LF \ + adc x7, x7, xzr __LF \ + mov x1, C __LF \ + ldp x9, x10, [P1] __LF \ + mul x8, x9, x1 __LF \ + umulh x9, x9, x1 __LF \ + adds x3, x3, x8 __LF \ + mul x8, x10, x1 __LF \ + umulh x10, x10, x1 __LF \ + adcs x4, x4, x8 __LF \ + ldp x11, x12, [P1+16] __LF \ + mul x8, x11, x1 __LF \ + umulh x11, x11, x1 __LF \ + adcs x5, x5, x8 __LF \ + mul x8, x12, x1 __LF \ + umulh x12, x12, x1 __LF \ + adcs x6, x6, x8 __LF \ + adc x7, x7, xzr __LF \ + adds x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + adcs x6, x6, x11 __LF \ + adc x7, x7, x12 __LF \ + add x7, x7, #0x1 __LF \ + lsl x8, x7, #32 __LF \ + sub x9, x8, x7 __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, xzr __LF \ + adcs x6, x6, x8 __LF \ + csetm x7, cc __LF \ + adds x3, x3, x7 __LF \ + and x9, x7, #0xffffffff00000000 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, x7 __LF \ + and x8, x7, #0xfffffffeffffffff __LF \ + adc x6, x6, x8 __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +// P0 = 4 * P1 - P2, by direct subtraction of P2; the method +// in bignum_cmul_sm2 etc. for quotient estimation still +// works when the value to be reduced is negative, as +// long as it is > -p_sm2, which is the case here. 
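+//
+// Note added for this import (not upstream commentary), spelling out the
+// q = h + 1 bound: write the value to be reduced as v = 2^256 * h + l with
+// 0 <= l < 2^256. Then
+//
+//   v - (h + 1) * p_sm2 = (l - p_sm2) + h * (2^256 - p_sm2)
+//
+// is always >= -p_sm2, and since 2^256 - p_sm2 < 2^225 it stays below p_sm2
+// whenever h is well below 2^31 (here h never exceeds a few dozen). So one
+// subtraction of (h + 1) * p_sm2 followed by a single masked addition of
+// p_sm2 on borrow yields the fully reduced result, and as noted above the
+// same estimate still works for a negative v as long as v > -p_sm2.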
+ +#define cmsub41_sm2(P0,P1,P2) \ + ldp x1, x2, [P1] __LF \ + lsl x0, x1, #2 __LF \ + ldp x6, x7, [P2] __LF \ + subs x0, x0, x6 __LF \ + extr x1, x2, x1, #62 __LF \ + sbcs x1, x1, x7 __LF \ + ldp x3, x4, [P1+16] __LF \ + extr x2, x3, x2, #62 __LF \ + ldp x6, x7, [P2+16] __LF \ + sbcs x2, x2, x6 __LF \ + extr x3, x4, x3, #62 __LF \ + sbcs x3, x3, x7 __LF \ + lsr x4, x4, #62 __LF \ + sbc x4, x4, xzr __LF \ + add x4, x4, #0x1 __LF \ + lsl x5, x4, #32 __LF \ + sub x6, x5, x4 __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x6 __LF \ + adcs x2, x2, xzr __LF \ + adcs x3, x3, x5 __LF \ + csetm x4, cc __LF \ + adds x0, x0, x4 __LF \ + and x6, x4, #0xffffffff00000000 __LF \ + adcs x1, x1, x6 __LF \ + adcs x2, x2, x4 __LF \ + and x5, x4, #0xfffffffeffffffff __LF \ + adc x3, x3, x5 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +// P0 = 3 * P1 - 8 * P2, computed as (p_sm2 - P2) << 3 + 3 * P1 + +#define cmsub38_sm2(P0,P1,P2) \ + mov x1, 8 __LF \ + mov x2, #-1 __LF \ + ldp x9, x10, [P2] __LF \ + subs x9, x2, x9 __LF \ + mov x3, #0xffffffff00000000 __LF \ + sbcs x10, x3, x10 __LF \ + ldp x11, x12, [P2+16] __LF \ + sbcs x11, x2, x11 __LF \ + mov x4, #0xfffffffeffffffff __LF \ + sbc x12, x4, x12 __LF \ + lsl x3, x9, #3 __LF \ + extr x4, x10, x9, #61 __LF \ + extr x5, x11, x10, #61 __LF \ + extr x6, x12, x11, #61 __LF \ + lsr x7, x12, #61 __LF \ + mov x1, 3 __LF \ + ldp x9, x10, [P1] __LF \ + mul x8, x9, x1 __LF \ + umulh x9, x9, x1 __LF \ + adds x3, x3, x8 __LF \ + mul x8, x10, x1 __LF \ + umulh x10, x10, x1 __LF \ + adcs x4, x4, x8 __LF \ + ldp x11, x12, [P1+16] __LF \ + mul x8, x11, x1 __LF \ + umulh x11, x11, x1 __LF \ + adcs x5, x5, x8 __LF \ + mul x8, x12, x1 __LF \ + umulh x12, x12, x1 __LF \ + adcs x6, x6, x8 __LF \ + adc x7, x7, xzr __LF \ + adds x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + adcs x6, x6, x11 __LF \ + adc x7, x7, x12 __LF \ + add x7, x7, #0x1 __LF \ + lsl x8, x7, #32 __LF \ + sub x9, x8, x7 __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, xzr __LF \ + adcs x6, x6, x8 __LF \ + csetm x7, cc __LF \ + adds x3, x3, x7 __LF \ + and x9, x7, #0xffffffff00000000 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, x7 __LF \ + and x8, x7, #0xfffffffeffffffff __LF \ + adc x6, x6, x8 __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +S2N_BN_SYMBOL(sm2_montjdouble): + +// Save registers and make room on stack for temporary variables + + sub sp, sp, NSPACE+16 + stp x19, x20, [sp, NSPACE] + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + montsqr_sm2(z2,z_1) + montsqr_sm2(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + sub_sm2(t2,x_1,z2) + weakadd_sm2(t1,x_1,z2) + montmul_sm2(x2p,t1,t2) + +// t1 = y + z +// xy2 = x * y^2 +// x4p = x2p^2 + + add_sm2(t1,y_1,z_1) + montmul_sm2(xy2,x_1,y2) + montsqr_sm2(x4p,x2p) + +// t1 = (y + z)^2 + + montsqr_sm2(t1,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_sm2(d_,12,xy2,9,x4p) + sub_sm2(t1,t1,z2) + +// y4 = y^4 + + montsqr_sm2(y4,y2) + +// dx2 = d * x2p + + montmul_sm2(dx2,d_,x2p) + +// z_3' = 2 * y * z + + sub_sm2(z_3,t1,y2) + +// x' = 4 * xy2 - d + + cmsub41_sm2(x_3,xy2,d_) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_sm2(y_3,dx2,y4) + +// Restore registers and stack and return + + ldp x19, x20, [sp, NSPACE] + add sp, sp, NSPACE+16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git 
a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjdouble_alt.S new file mode 100644 index 00000000000..4d29b945fcf --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjdouble_alt.S @@ -0,0 +1,577 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on GM/T 0003-2012 curve SM2 in Montgomery-Jacobian coordinates +// +// extern void sm2_montjdouble_alt +// (uint64_t p3[static 12],uint64_t p1[static 12]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_sm2. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjdouble_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x15 +#define input_x x16 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z2 sp, #(NUMSIZE*0) +#define y4 sp, #(NUMSIZE*0) + +#define y2 sp, #(NUMSIZE*1) + +#define t1 sp, #(NUMSIZE*2) + +#define t2 sp, #(NUMSIZE*3) +#define x2p sp, #(NUMSIZE*3) +#define dx2 sp, #(NUMSIZE*3) + +#define xy2 sp, #(NUMSIZE*4) + +#define x4p sp, #(NUMSIZE*5) +#define d sp, #(NUMSIZE*5) + +#define NSPACE #(NUMSIZE*6) + +// Corresponds to bignum_montmul_sm2_alt except for registers + +#define montmul_sm2(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x0, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x1, x3, x10 __LF \ + adcs x0, x0, x11 __LF \ + adc x1, x1, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x1, x1, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x0, x0, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x1, x1, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x0, x0, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x1, x1, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x0, x0, x11 __LF \ + 
mul x11, x6, x8 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x1, x1, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + lsl x11, x12, #32 __LF \ + lsr x6, x12, #32 __LF \ + subs x8, x11, x12 __LF \ + sbc x7, x6, xzr __LF \ + subs x13, x13, x8 __LF \ + sbcs x14, x14, x7 __LF \ + sbcs x0, x0, x11 __LF \ + sbc x12, x12, x6 __LF \ + lsl x11, x13, #32 __LF \ + lsr x6, x13, #32 __LF \ + subs x8, x11, x13 __LF \ + sbc x7, x6, xzr __LF \ + subs x14, x14, x8 __LF \ + sbcs x0, x0, x7 __LF \ + sbcs x12, x12, x11 __LF \ + sbc x13, x13, x6 __LF \ + lsl x11, x14, #32 __LF \ + lsr x6, x14, #32 __LF \ + subs x8, x11, x14 __LF \ + sbc x7, x6, xzr __LF \ + subs x0, x0, x8 __LF \ + sbcs x12, x12, x7 __LF \ + sbcs x13, x13, x11 __LF \ + sbc x14, x14, x6 __LF \ + lsl x11, x0, #32 __LF \ + lsr x6, x0, #32 __LF \ + subs x8, x11, x0 __LF \ + sbc x7, x6, xzr __LF \ + subs x12, x12, x8 __LF \ + sbcs x13, x13, x7 __LF \ + sbcs x14, x14, x11 __LF \ + sbc x0, x0, x6 __LF \ + adds x12, x12, x1 __LF \ + adcs x13, x13, x3 __LF \ + adcs x14, x14, x4 __LF \ + adcs x0, x0, x5 __LF \ + cset x8, cs __LF \ + mov x11, #0xffffffff00000000 __LF \ + mov x6, #0xfffffffeffffffff __LF \ + adds x1, x12, #0x1 __LF \ + sbcs x3, x13, x11 __LF \ + adcs x4, x14, xzr __LF \ + sbcs x5, x0, x6 __LF \ + sbcs xzr, x8, xzr __LF \ + csel x12, x12, x1, cc __LF \ + csel x13, x13, x3, cc __LF \ + csel x14, x14, x4, cc __LF \ + csel x0, x0, x5, cc __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x0, [P0+16] + +// Corresponds to bignum_montsqr_sm2_alt exactly + +#define montsqr_sm2(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x6, x2, x4 __LF \ + umulh x7, x2, x4 __LF \ + adds x10, x10, x6 __LF \ + adcs x11, x11, x7 __LF \ + mul x6, x3, x4 __LF \ + umulh x7, x3, x4 __LF \ + adc x7, x7, xzr __LF \ + adds x11, x11, x6 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x7 __LF \ + mul x6, x3, x5 __LF \ + umulh x7, x3, x5 __LF \ + adc x7, x7, xzr __LF \ + adds x12, x12, x6 __LF \ + adcs x13, x13, x7 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x7, cs __LF \ + umulh x6, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x6 __LF \ + mul x6, x3, x3 __LF \ + adcs x10, x10, x6 __LF \ + umulh x6, x3, x3 __LF \ + adcs x11, x11, x6 __LF \ + mul x6, x4, x4 __LF \ + adcs x12, x12, x6 __LF \ + umulh x6, x4, x4 __LF \ + adcs x13, x13, x6 __LF \ + mul x6, x5, x5 __LF \ + adcs x14, x14, x6 __LF \ + umulh x6, x5, x5 __LF \ + adc x7, x7, x6 __LF \ + lsl x4, x8, #32 __LF \ + lsr x5, x8, #32 __LF \ + subs x2, x4, x8 __LF \ + sbc x3, x5, xzr __LF \ + subs x9, x9, x2 __LF \ + sbcs x10, x10, x3 __LF \ + sbcs x11, x11, x4 __LF \ + sbc x8, x8, x5 __LF \ + lsl x4, x9, #32 __LF \ + lsr x5, x9, #32 __LF \ + subs x2, x4, x9 __LF \ + sbc x3, x5, xzr __LF \ + subs x10, x10, x2 __LF \ + sbcs x11, x11, x3 __LF \ + sbcs x8, x8, x4 __LF \ + sbc x9, x9, x5 __LF \ + lsl x4, x10, #32 __LF \ + lsr x5, x10, #32 __LF \ + subs x2, x4, x10 __LF \ + sbc x3, x5, xzr __LF \ + subs x11, x11, x2 __LF \ + sbcs x8, x8, x3 __LF \ + 
sbcs x9, x9, x4 __LF \ + sbc x10, x10, x5 __LF \ + lsl x4, x11, #32 __LF \ + lsr x5, x11, #32 __LF \ + subs x2, x4, x11 __LF \ + sbc x3, x5, xzr __LF \ + subs x8, x8, x2 __LF \ + sbcs x9, x9, x3 __LF \ + sbcs x10, x10, x4 __LF \ + sbc x11, x11, x5 __LF \ + adds x8, x8, x12 __LF \ + adcs x9, x9, x13 __LF \ + adcs x10, x10, x14 __LF \ + adcs x11, x11, x7 __LF \ + cset x2, cs __LF \ + mov x3, #0xffffffff00000000 __LF \ + mov x5, #0xfffffffeffffffff __LF \ + adds x12, x8, #0x1 __LF \ + sbcs x13, x9, x3 __LF \ + adcs x14, x10, xzr __LF \ + sbcs x7, x11, x5 __LF \ + sbcs xzr, x2, xzr __LF \ + csel x8, x8, x12, cc __LF \ + csel x9, x9, x13, cc __LF \ + csel x10, x10, x14, cc __LF \ + csel x11, x11, x7, cc __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Corresponds exactly to bignum_sub_sm2 + +#define sub_sm2(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + csetm x3, cc __LF \ + adds x5, x5, x3 __LF \ + and x4, x3, #0xffffffff00000000 __LF \ + adcs x6, x6, x4 __LF \ + adcs x7, x7, x3 __LF \ + and x4, x3, #0xfffffffeffffffff __LF \ + adc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Corresponds exactly to bignum_add_sm2 + +#define add_sm2(P0,P1,P2) \ + ldp x4, x5, [P1] __LF \ + ldp x8, x9, [P2] __LF \ + adds x4, x4, x8 __LF \ + adcs x5, x5, x9 __LF \ + ldp x6, x7, [P1+16] __LF \ + ldp x10, x11, [P2+16] __LF \ + adcs x6, x6, x10 __LF \ + adcs x7, x7, x11 __LF \ + adc x3, xzr, xzr __LF \ + adds x8, x4, #0x1 __LF \ + mov x9, #0xffffffff00000000 __LF \ + sbcs x9, x5, x9 __LF \ + adcs x10, x6, xzr __LF \ + mov x11, #0xfffffffeffffffff __LF \ + sbcs x11, x7, x11 __LF \ + sbcs x3, x3, xzr __LF \ + csel x4, x4, x8, cc __LF \ + csel x5, x5, x9, cc __LF \ + csel x6, x6, x10, cc __LF \ + csel x7, x7, x11, cc __LF \ + stp x4, x5, [P0] __LF \ + stp x6, x7, [P0+16] + +// A weak version of add that only guarantees sum in 4 digits + +#define weakadd_sm2(P0,P1,P2) \ + ldp x4, x5, [P1] __LF \ + ldp x8, x9, [P2] __LF \ + adds x4, x4, x8 __LF \ + adcs x5, x5, x9 __LF \ + ldp x6, x7, [P1+16] __LF \ + ldp x10, x11, [P2+16] __LF \ + adcs x6, x6, x10 __LF \ + adcs x7, x7, x11 __LF \ + csetm x2, cs __LF \ + subs x4, x4, x2 __LF \ + and x3, x2, #0xffffffff00000000 __LF \ + sbcs x5, x5, x3 __LF \ + and x1, x2, #0xfffffffeffffffff __LF \ + sbcs x6, x6, x2 __LF \ + sbc x7, x7, x1 __LF \ + stp x4, x5, [P0] __LF \ + stp x6, x7, [P0+16] + +// P0 = C * P1 - D * P2 computed as D * (p_sm2 - P2) + C * P1 +// Quotient estimation is done just as q = h + 1 as in bignum_triple_sm2 +// This also applies to the other functions following. 
+ +#define cmsub_sm2(P0,C,P1,D,P2) \ + mov x1, D __LF \ + mov x2, #-1 __LF \ + ldp x9, x10, [P2] __LF \ + subs x9, x2, x9 __LF \ + mov x3, #0xffffffff00000000 __LF \ + sbcs x10, x3, x10 __LF \ + ldp x11, x12, [P2+16] __LF \ + sbcs x11, x2, x11 __LF \ + mov x4, #0xfffffffeffffffff __LF \ + sbc x12, x4, x12 __LF \ + mul x3, x1, x9 __LF \ + mul x4, x1, x10 __LF \ + mul x5, x1, x11 __LF \ + mul x6, x1, x12 __LF \ + umulh x9, x1, x9 __LF \ + umulh x10, x1, x10 __LF \ + umulh x11, x1, x11 __LF \ + umulh x7, x1, x12 __LF \ + adds x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + adcs x6, x6, x11 __LF \ + adc x7, x7, xzr __LF \ + mov x1, C __LF \ + ldp x9, x10, [P1] __LF \ + mul x8, x9, x1 __LF \ + umulh x9, x9, x1 __LF \ + adds x3, x3, x8 __LF \ + mul x8, x10, x1 __LF \ + umulh x10, x10, x1 __LF \ + adcs x4, x4, x8 __LF \ + ldp x11, x12, [P1+16] __LF \ + mul x8, x11, x1 __LF \ + umulh x11, x11, x1 __LF \ + adcs x5, x5, x8 __LF \ + mul x8, x12, x1 __LF \ + umulh x12, x12, x1 __LF \ + adcs x6, x6, x8 __LF \ + adc x7, x7, xzr __LF \ + adds x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + adcs x6, x6, x11 __LF \ + adc x7, x7, x12 __LF \ + add x7, x7, #0x1 __LF \ + lsl x8, x7, #32 __LF \ + sub x9, x8, x7 __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, xzr __LF \ + adcs x6, x6, x8 __LF \ + csetm x7, cc __LF \ + adds x3, x3, x7 __LF \ + and x9, x7, #0xffffffff00000000 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, x7 __LF \ + and x8, x7, #0xfffffffeffffffff __LF \ + adc x6, x6, x8 __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +// P0 = 4 * P1 - P2, by direct subtraction of P2; the method +// in bignum_cmul_sm2 etc. for quotient estimation still +// works when the value to be reduced is negative, as +// long as it is > -p_sm2, which is the case here. 
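+//
+// (Note added for this import: as in sm2_montjdouble.S, writing the value to
+// be reduced as v = 2^256 * h + l with 0 <= l < 2^256 and h small, the value
+// v - (h + 1) * p_sm2 lies in [-p_sm2, p_sm2), so the single masked addition
+// of p_sm2 after the subtraction fully reduces the result.)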
+ +#define cmsub41_sm2(P0,P1,P2) \ + ldp x1, x2, [P1] __LF \ + lsl x0, x1, #2 __LF \ + ldp x6, x7, [P2] __LF \ + subs x0, x0, x6 __LF \ + extr x1, x2, x1, #62 __LF \ + sbcs x1, x1, x7 __LF \ + ldp x3, x4, [P1+16] __LF \ + extr x2, x3, x2, #62 __LF \ + ldp x6, x7, [P2+16] __LF \ + sbcs x2, x2, x6 __LF \ + extr x3, x4, x3, #62 __LF \ + sbcs x3, x3, x7 __LF \ + lsr x4, x4, #62 __LF \ + sbc x4, x4, xzr __LF \ + add x4, x4, #0x1 __LF \ + lsl x5, x4, #32 __LF \ + sub x6, x5, x4 __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x6 __LF \ + adcs x2, x2, xzr __LF \ + adcs x3, x3, x5 __LF \ + csetm x4, cc __LF \ + adds x0, x0, x4 __LF \ + and x6, x4, #0xffffffff00000000 __LF \ + adcs x1, x1, x6 __LF \ + adcs x2, x2, x4 __LF \ + and x5, x4, #0xfffffffeffffffff __LF \ + adc x3, x3, x5 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +// P0 = 3 * P1 - 8 * P2, computed as (p_sm2 - P2) << 3 + 3 * P1 + +#define cmsub38_sm2(P0,P1,P2) \ + mov x1, 8 __LF \ + mov x2, #-1 __LF \ + ldp x9, x10, [P2] __LF \ + subs x9, x2, x9 __LF \ + mov x3, #0xffffffff00000000 __LF \ + sbcs x10, x3, x10 __LF \ + ldp x11, x12, [P2+16] __LF \ + sbcs x11, x2, x11 __LF \ + mov x4, #0xfffffffeffffffff __LF \ + sbc x12, x4, x12 __LF \ + lsl x3, x9, #3 __LF \ + extr x4, x10, x9, #61 __LF \ + extr x5, x11, x10, #61 __LF \ + extr x6, x12, x11, #61 __LF \ + lsr x7, x12, #61 __LF \ + mov x1, 3 __LF \ + ldp x9, x10, [P1] __LF \ + mul x8, x9, x1 __LF \ + umulh x9, x9, x1 __LF \ + adds x3, x3, x8 __LF \ + mul x8, x10, x1 __LF \ + umulh x10, x10, x1 __LF \ + adcs x4, x4, x8 __LF \ + ldp x11, x12, [P1+16] __LF \ + mul x8, x11, x1 __LF \ + umulh x11, x11, x1 __LF \ + adcs x5, x5, x8 __LF \ + mul x8, x12, x1 __LF \ + umulh x12, x12, x1 __LF \ + adcs x6, x6, x8 __LF \ + adc x7, x7, xzr __LF \ + adds x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + adcs x6, x6, x11 __LF \ + adc x7, x7, x12 __LF \ + add x7, x7, #0x1 __LF \ + lsl x8, x7, #32 __LF \ + sub x9, x8, x7 __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, xzr __LF \ + adcs x6, x6, x8 __LF \ + csetm x7, cc __LF \ + adds x3, x3, x7 __LF \ + and x9, x7, #0xffffffff00000000 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, x7 __LF \ + and x8, x7, #0xfffffffeffffffff __LF \ + adc x6, x6, x8 __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +S2N_BN_SYMBOL(sm2_montjdouble_alt): + +// Make room on stack for temporary variables + + sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + montsqr_sm2(z2,z_1) + montsqr_sm2(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + sub_sm2(t2,x_1,z2) + weakadd_sm2(t1,x_1,z2) + montmul_sm2(x2p,t1,t2) + +// t1 = y + z +// xy2 = x * y^2 +// x4p = x2p^2 + + add_sm2(t1,y_1,z_1) + montmul_sm2(xy2,x_1,y2) + montsqr_sm2(x4p,x2p) + +// t1 = (y + z)^2 + + montsqr_sm2(t1,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_sm2(d,12,xy2,9,x4p) + sub_sm2(t1,t1,z2) + +// y4 = y^4 + + montsqr_sm2(y4,y2) + +// dx2 = d * x2p + + montmul_sm2(dx2,d,x2p) + +// z_3' = 2 * y * z + + sub_sm2(z_3,t1,y2) + +// x' = 4 * xy2 - d + + cmsub41_sm2(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_sm2(y_3,dx2,y4) + +// Restore stack and return + + add sp, sp, NSPACE + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjmixadd.S 
b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjmixadd.S new file mode 100644 index 00000000000..9f7c13cf740 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjmixadd.S @@ -0,0 +1,501 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on GM/T 0003-2012 curve SM2 in Montgomery-Jacobian coordinates +// +// extern void sm2_montjmixadd +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_sm2. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjmixadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjmixadd) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x17 +#define input_x x19 +#define input_y x20 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define NSPACE (NUMSIZE*6) + +// Corresponds to bignum_montmul_sm2 with x0 in place of x17 + +#define montmul_sm2(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2] __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x7 __LF \ + mul x13, x4, x8 __LF \ + umulh x12, x3, x7 __LF \ + adds x16, x11, x13 __LF \ + umulh x14, x4, x8 __LF \ + adcs x0, x12, x14 __LF \ + adcs x14, x14, xzr __LF \ + adds x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adcs x14, x14, xzr __LF \ + subs x15, x3, x4 __LF \ + cneg x15, x15, lo __LF \ + csetm x1, lo __LF \ + subs x0, x8, x7 __LF \ + cneg x0, x0, lo __LF \ + mul x16, x15, x0 __LF \ + umulh x0, x15, x0 __LF \ + cinv x1, x1, lo __LF \ + eor x16, x16, x1 __LF \ + eor x0, x0, x1 __LF \ + cmn x1, #1 __LF \ + adcs x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adc x14, x14, x1 __LF \ + lsl x16, x11, #32 __LF \ + lsr x15, x11, #32 __LF \ + subs x1, x16, x11 __LF \ + sbc x0, x15, xzr __LF \ + subs x12, x12, x1 __LF \ + sbcs x13, x13, x0 __LF \ + sbcs x14, x14, x16 __LF \ + sbc x11, x11, x15 __LF \ + lsl x16, x12, #32 __LF \ + lsr x15, x12, #32 __LF \ + subs x1, x16, x12 __LF \ + sbc x0, x15, xzr __LF \ + subs x13, x13, x1 __LF \ + sbcs 
x14, x14, x0 __LF \ + sbcs x11, x11, x16 __LF \ + sbc x12, x12, x15 __LF \ + stp x13, x14, [P0] __LF \ + stp x11, x12, [P0+16] __LF \ + mul x11, x5, x9 __LF \ + mul x13, x6, x10 __LF \ + umulh x12, x5, x9 __LF \ + adds x16, x11, x13 __LF \ + umulh x14, x6, x10 __LF \ + adcs x0, x12, x14 __LF \ + adcs x14, x14, xzr __LF \ + adds x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adcs x14, x14, xzr __LF \ + subs x15, x5, x6 __LF \ + cneg x15, x15, lo __LF \ + csetm x1, lo __LF \ + subs x0, x10, x9 __LF \ + cneg x0, x0, lo __LF \ + mul x16, x15, x0 __LF \ + umulh x0, x15, x0 __LF \ + cinv x1, x1, lo __LF \ + eor x16, x16, x1 __LF \ + eor x0, x0, x1 __LF \ + cmn x1, #1 __LF \ + adcs x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adc x14, x14, x1 __LF \ + subs x3, x5, x3 __LF \ + sbcs x4, x6, x4 __LF \ + ngc x5, xzr __LF \ + cmn x5, #1 __LF \ + eor x3, x3, x5 __LF \ + adcs x3, x3, xzr __LF \ + eor x4, x4, x5 __LF \ + adcs x4, x4, xzr __LF \ + subs x7, x7, x9 __LF \ + sbcs x8, x8, x10 __LF \ + ngc x9, xzr __LF \ + cmn x9, #1 __LF \ + eor x7, x7, x9 __LF \ + adcs x7, x7, xzr __LF \ + eor x8, x8, x9 __LF \ + adcs x8, x8, xzr __LF \ + eor x10, x5, x9 __LF \ + ldp x15, x1, [P0] __LF \ + adds x15, x11, x15 __LF \ + adcs x1, x12, x1 __LF \ + ldp x5, x9, [P0+16] __LF \ + adcs x5, x13, x5 __LF \ + adcs x9, x14, x9 __LF \ + adc x2, xzr, xzr __LF \ + mul x11, x3, x7 __LF \ + mul x13, x4, x8 __LF \ + umulh x12, x3, x7 __LF \ + adds x16, x11, x13 __LF \ + umulh x14, x4, x8 __LF \ + adcs x0, x12, x14 __LF \ + adcs x14, x14, xzr __LF \ + adds x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adcs x14, x14, xzr __LF \ + subs x3, x3, x4 __LF \ + cneg x3, x3, lo __LF \ + csetm x4, lo __LF \ + subs x0, x8, x7 __LF \ + cneg x0, x0, lo __LF \ + mul x16, x3, x0 __LF \ + umulh x0, x3, x0 __LF \ + cinv x4, x4, lo __LF \ + eor x16, x16, x4 __LF \ + eor x0, x0, x4 __LF \ + cmn x4, #1 __LF \ + adcs x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adc x14, x14, x4 __LF \ + cmn x10, #1 __LF \ + eor x11, x11, x10 __LF \ + adcs x11, x11, x15 __LF \ + eor x12, x12, x10 __LF \ + adcs x12, x12, x1 __LF \ + eor x13, x13, x10 __LF \ + adcs x13, x13, x5 __LF \ + eor x14, x14, x10 __LF \ + adcs x14, x14, x9 __LF \ + adcs x3, x2, x10 __LF \ + adcs x4, x10, xzr __LF \ + adc x10, x10, xzr __LF \ + adds x13, x13, x15 __LF \ + adcs x14, x14, x1 __LF \ + adcs x3, x3, x5 __LF \ + adcs x4, x4, x9 __LF \ + adc x10, x10, x2 __LF \ + lsl x16, x11, #32 __LF \ + lsr x15, x11, #32 __LF \ + subs x1, x16, x11 __LF \ + sbc x0, x15, xzr __LF \ + subs x12, x12, x1 __LF \ + sbcs x13, x13, x0 __LF \ + sbcs x14, x14, x16 __LF \ + sbc x11, x11, x15 __LF \ + lsl x16, x12, #32 __LF \ + lsr x15, x12, #32 __LF \ + subs x1, x16, x12 __LF \ + sbc x0, x15, xzr __LF \ + subs x13, x13, x1 __LF \ + sbcs x14, x14, x0 __LF \ + sbcs x11, x11, x16 __LF \ + sbc x12, x12, x15 __LF \ + adds x3, x3, x11 __LF \ + adcs x4, x4, x12 __LF \ + adc x10, x10, xzr __LF \ + add x2, x10, #1 __LF \ + lsl x15, x2, #32 __LF \ + sub x16, x15, x2 __LF \ + adds x13, x13, x2 __LF \ + adcs x14, x14, x16 __LF \ + adcs x3, x3, xzr __LF \ + adcs x4, x4, x15 __LF \ + csetm x7, lo __LF \ + adds x13, x13, x7 __LF \ + and x16, x7, #0xffffffff00000000 __LF \ + adcs x14, x14, x16 __LF \ + adcs x3, x3, x7 __LF \ + and x15, x7, #0xfffffffeffffffff __LF \ + adc x4, x4, x15 __LF \ + stp x13, x14, [P0] __LF \ + stp x3, x4, [P0+16] + +// Corresponds to bignum_montsqr_sm2 with x0 in place of x17 + +#define montsqr_sm2(P0,P1) \ + ldp x2, x3, [P1] __LF \ + ldp x4, x5, [P1+16] __LF \ + umull x15, w2, w2 
__LF \ + lsr x11, x2, #32 __LF \ + umull x16, w11, w11 __LF \ + umull x11, w2, w11 __LF \ + adds x15, x15, x11, lsl #33 __LF \ + lsr x11, x11, #31 __LF \ + adc x16, x16, x11 __LF \ + umull x0, w3, w3 __LF \ + lsr x11, x3, #32 __LF \ + umull x1, w11, w11 __LF \ + umull x11, w3, w11 __LF \ + mul x12, x2, x3 __LF \ + umulh x13, x2, x3 __LF \ + adds x0, x0, x11, lsl #33 __LF \ + lsr x11, x11, #31 __LF \ + adc x1, x1, x11 __LF \ + adds x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adc x1, x1, xzr __LF \ + adds x16, x16, x12 __LF \ + adcs x0, x0, x13 __LF \ + adc x1, x1, xzr __LF \ + lsl x12, x15, #32 __LF \ + lsr x11, x15, #32 __LF \ + subs x14, x12, x15 __LF \ + sbc x13, x11, xzr __LF \ + subs x16, x16, x14 __LF \ + sbcs x0, x0, x13 __LF \ + sbcs x1, x1, x12 __LF \ + sbc x15, x15, x11 __LF \ + lsl x12, x16, #32 __LF \ + lsr x11, x16, #32 __LF \ + subs x14, x12, x16 __LF \ + sbc x13, x11, xzr __LF \ + subs x0, x0, x14 __LF \ + sbcs x1, x1, x13 __LF \ + sbcs x15, x15, x12 __LF \ + sbc x16, x16, x11 __LF \ + mul x6, x2, x4 __LF \ + mul x14, x3, x5 __LF \ + umulh x8, x2, x4 __LF \ + subs x10, x2, x3 __LF \ + cneg x10, x10, lo __LF \ + csetm x13, lo __LF \ + subs x12, x5, x4 __LF \ + cneg x12, x12, lo __LF \ + mul x11, x10, x12 __LF \ + umulh x12, x10, x12 __LF \ + cinv x13, x13, lo __LF \ + eor x11, x11, x13 __LF \ + eor x12, x12, x13 __LF \ + adds x7, x6, x8 __LF \ + adc x8, x8, xzr __LF \ + umulh x9, x3, x5 __LF \ + adds x7, x7, x14 __LF \ + adcs x8, x8, x9 __LF \ + adc x9, x9, xzr __LF \ + adds x8, x8, x14 __LF \ + adc x9, x9, xzr __LF \ + cmn x13, #1 __LF \ + adcs x7, x7, x11 __LF \ + adcs x8, x8, x12 __LF \ + adc x9, x9, x13 __LF \ + adds x6, x6, x6 __LF \ + adcs x7, x7, x7 __LF \ + adcs x8, x8, x8 __LF \ + adcs x9, x9, x9 __LF \ + adc x10, xzr, xzr __LF \ + adds x6, x6, x0 __LF \ + adcs x7, x7, x1 __LF \ + adcs x8, x8, x15 __LF \ + adcs x9, x9, x16 __LF \ + adc x10, x10, xzr __LF \ + lsl x12, x6, #32 __LF \ + lsr x11, x6, #32 __LF \ + subs x14, x12, x6 __LF \ + sbc x13, x11, xzr __LF \ + subs x7, x7, x14 __LF \ + sbcs x8, x8, x13 __LF \ + sbcs x9, x9, x12 __LF \ + sbc x14, x6, x11 __LF \ + adds x10, x10, x14 __LF \ + adc x6, xzr, xzr __LF \ + lsl x12, x7, #32 __LF \ + lsr x11, x7, #32 __LF \ + subs x14, x12, x7 __LF \ + sbc x13, x11, xzr __LF \ + subs x8, x8, x14 __LF \ + sbcs x9, x9, x13 __LF \ + sbcs x10, x10, x12 __LF \ + sbc x14, x7, x11 __LF \ + adds x6, x6, x14 __LF \ + adc x7, xzr, xzr __LF \ + mul x11, x4, x4 __LF \ + adds x8, x8, x11 __LF \ + mul x12, x5, x5 __LF \ + umulh x11, x4, x4 __LF \ + adcs x9, x9, x11 __LF \ + adcs x10, x10, x12 __LF \ + umulh x12, x5, x5 __LF \ + adcs x6, x6, x12 __LF \ + adc x7, x7, xzr __LF \ + mul x11, x4, x5 __LF \ + umulh x12, x4, x5 __LF \ + adds x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adc x13, xzr, xzr __LF \ + adds x9, x9, x11 __LF \ + adcs x10, x10, x12 __LF \ + adcs x6, x6, x13 __LF \ + adcs x7, x7, xzr __LF \ + mov x11, #-4294967296 __LF \ + adds x5, x8, #1 __LF \ + sbcs x11, x9, x11 __LF \ + mov x13, #-4294967297 __LF \ + adcs x12, x10, xzr __LF \ + sbcs x13, x6, x13 __LF \ + sbcs xzr, x7, xzr __LF \ + csel x8, x5, x8, hs __LF \ + csel x9, x11, x9, hs __LF \ + csel x10, x12, x10, hs __LF \ + csel x6, x13, x6, hs __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x6, [P0+16] + +// Corresponds exactly to bignum_sub_sm2 + +#define sub_sm2(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs 
x8, x8, x3 __LF \ + csetm x3, cc __LF \ + adds x5, x5, x3 __LF \ + and x4, x3, #0xffffffff00000000 __LF \ + adcs x6, x6, x4 __LF \ + adcs x7, x7, x3 __LF \ + and x4, x3, #0xfffffffeffffffff __LF \ + adc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +S2N_BN_SYMBOL(sm2_montjmixadd): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + montsqr_sm2(zp2,z_1) + montmul_sm2(y2a,z_1,y_2) + + montmul_sm2(x2a,zp2,x_2) + montmul_sm2(y2a,zp2,y2a) + + sub_sm2(xd,x2a,x_1) + sub_sm2(yd,y2a,y_1) + + montsqr_sm2(zz,xd) + montsqr_sm2(ww,yd) + + montmul_sm2(zzx1,zz,x_1) + montmul_sm2(zzx2,zz,x2a) + + sub_sm2(resx,ww,zzx1) + sub_sm2(t1,zzx2,zzx1) + + montmul_sm2(resz,xd,z_1) + + sub_sm2(resx,resx,zzx2) + + sub_sm2(t2,zzx1,resx) + + montmul_sm2(t1,t1,y_1) + montmul_sm2(t2,yd,t2) + + sub_sm2(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + orr x4, x0, x1 + orr x5, x2, x3 + orr x4, x4, x5 + cmp x4, xzr + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^256 - p_sm2), +// hence giving 0 + p2 = p2 for the final result. + + ldp x0, x1, [resx] + ldp x12, x13, [x_2] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x2, x3, [resx+16] + ldp x12, x13, [x_2+16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + + ldp x4, x5, [resy] + ldp x12, x13, [y_2] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x6, x7, [resy+16] + ldp x12, x13, [y_2+16] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + + ldp x8, x9, [resz] + mov x12, #0x0000000000000001 + mov x13, #0x00000000ffffffff + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x10, x11, [resz+16] + mov x13, #0x0000000100000000 + csel x10, x10, xzr, ne + csel x11, x11, x13, ne + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [y_3] + stp x6, x7, [y_3+16] + stp x8, x9, [z_3] + stp x10, x11, [z_3+16] + +// Restore registers and return + + add sp, sp, NSPACE + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjmixadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjmixadd_alt.S new file mode 100644 index 00000000000..6c4efc1eb4e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjmixadd_alt.S @@ -0,0 +1,509 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on GM/T 0003-2012 curve SM2 in Montgomery-Jacobian coordinates +// +// extern void sm2_montjmixadd_alt +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_sm2. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). 
+// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjmixadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjmixadd_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x15 +#define input_x x16 +#define input_y x17 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define NSPACE (NUMSIZE*6) + +// Corresponds to bignum_montmul_sm2_alt except for registers + +#define montmul_sm2(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x0, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x1, x3, x10 __LF \ + adcs x0, x0, x11 __LF \ + adc x1, x1, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x1, x1, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x0, x0, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x1, x1, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x0, x0, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x1, x1, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x0, x0, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x1, x1, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + lsl x11, x12, #32 __LF \ + lsr x6, x12, #32 __LF \ + subs x8, x11, x12 __LF \ + sbc x7, x6, xzr __LF \ + subs x13, x13, x8 __LF \ + sbcs x14, x14, x7 __LF \ + sbcs x0, x0, x11 __LF \ + sbc x12, x12, 
x6 __LF \ + lsl x11, x13, #32 __LF \ + lsr x6, x13, #32 __LF \ + subs x8, x11, x13 __LF \ + sbc x7, x6, xzr __LF \ + subs x14, x14, x8 __LF \ + sbcs x0, x0, x7 __LF \ + sbcs x12, x12, x11 __LF \ + sbc x13, x13, x6 __LF \ + lsl x11, x14, #32 __LF \ + lsr x6, x14, #32 __LF \ + subs x8, x11, x14 __LF \ + sbc x7, x6, xzr __LF \ + subs x0, x0, x8 __LF \ + sbcs x12, x12, x7 __LF \ + sbcs x13, x13, x11 __LF \ + sbc x14, x14, x6 __LF \ + lsl x11, x0, #32 __LF \ + lsr x6, x0, #32 __LF \ + subs x8, x11, x0 __LF \ + sbc x7, x6, xzr __LF \ + subs x12, x12, x8 __LF \ + sbcs x13, x13, x7 __LF \ + sbcs x14, x14, x11 __LF \ + sbc x0, x0, x6 __LF \ + adds x12, x12, x1 __LF \ + adcs x13, x13, x3 __LF \ + adcs x14, x14, x4 __LF \ + adcs x0, x0, x5 __LF \ + cset x8, cs __LF \ + mov x11, #0xffffffff00000000 __LF \ + mov x6, #0xfffffffeffffffff __LF \ + adds x1, x12, #0x1 __LF \ + sbcs x3, x13, x11 __LF \ + adcs x4, x14, xzr __LF \ + sbcs x5, x0, x6 __LF \ + sbcs xzr, x8, xzr __LF \ + csel x12, x12, x1, cc __LF \ + csel x13, x13, x3, cc __LF \ + csel x14, x14, x4, cc __LF \ + csel x0, x0, x5, cc __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x0, [P0+16] + +// Corresponds to bignum_montsqr_sm2_alt exactly + +#define montsqr_sm2(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x6, x2, x4 __LF \ + umulh x7, x2, x4 __LF \ + adds x10, x10, x6 __LF \ + adcs x11, x11, x7 __LF \ + mul x6, x3, x4 __LF \ + umulh x7, x3, x4 __LF \ + adc x7, x7, xzr __LF \ + adds x11, x11, x6 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x7 __LF \ + mul x6, x3, x5 __LF \ + umulh x7, x3, x5 __LF \ + adc x7, x7, xzr __LF \ + adds x12, x12, x6 __LF \ + adcs x13, x13, x7 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x7, cs __LF \ + umulh x6, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x6 __LF \ + mul x6, x3, x3 __LF \ + adcs x10, x10, x6 __LF \ + umulh x6, x3, x3 __LF \ + adcs x11, x11, x6 __LF \ + mul x6, x4, x4 __LF \ + adcs x12, x12, x6 __LF \ + umulh x6, x4, x4 __LF \ + adcs x13, x13, x6 __LF \ + mul x6, x5, x5 __LF \ + adcs x14, x14, x6 __LF \ + umulh x6, x5, x5 __LF \ + adc x7, x7, x6 __LF \ + lsl x4, x8, #32 __LF \ + lsr x5, x8, #32 __LF \ + subs x2, x4, x8 __LF \ + sbc x3, x5, xzr __LF \ + subs x9, x9, x2 __LF \ + sbcs x10, x10, x3 __LF \ + sbcs x11, x11, x4 __LF \ + sbc x8, x8, x5 __LF \ + lsl x4, x9, #32 __LF \ + lsr x5, x9, #32 __LF \ + subs x2, x4, x9 __LF \ + sbc x3, x5, xzr __LF \ + subs x10, x10, x2 __LF \ + sbcs x11, x11, x3 __LF \ + sbcs x8, x8, x4 __LF \ + sbc x9, x9, x5 __LF \ + lsl x4, x10, #32 __LF \ + lsr x5, x10, #32 __LF \ + subs x2, x4, x10 __LF \ + sbc x3, x5, xzr __LF \ + subs x11, x11, x2 __LF \ + sbcs x8, x8, x3 __LF \ + sbcs x9, x9, x4 __LF \ + sbc x10, x10, x5 __LF \ + lsl x4, x11, #32 __LF \ + lsr x5, x11, #32 __LF \ + subs x2, x4, x11 __LF \ + sbc x3, x5, xzr __LF \ + subs x8, x8, x2 __LF \ + sbcs x9, x9, x3 __LF \ + sbcs x10, x10, x4 __LF \ + sbc x11, x11, x5 __LF \ + adds x8, x8, x12 __LF \ + adcs x9, x9, x13 __LF \ + adcs x10, x10, x14 __LF \ + adcs x11, x11, x7 __LF \ + cset x2, cs __LF \ + mov x3, #0xffffffff00000000 __LF \ + mov x5, #0xfffffffeffffffff __LF \ + adds x12, x8, #0x1 __LF \ + sbcs x13, x9, x3 __LF \ + adcs x14, x10, xzr __LF \ + sbcs x7, x11, x5 __LF \ + sbcs xzr, x2, xzr __LF 
\ + csel x8, x8, x12, cc __LF \ + csel x9, x9, x13, cc __LF \ + csel x10, x10, x14, cc __LF \ + csel x11, x11, x7, cc __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Almost-Montgomery variant which we use when an input to other muls +// with the other argument fully reduced (which is always safe). + +#define amontsqr_sm2(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x6, x2, x4 __LF \ + umulh x7, x2, x4 __LF \ + adds x10, x10, x6 __LF \ + adcs x11, x11, x7 __LF \ + mul x6, x3, x4 __LF \ + umulh x7, x3, x4 __LF \ + adc x7, x7, xzr __LF \ + adds x11, x11, x6 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x7 __LF \ + mul x6, x3, x5 __LF \ + umulh x7, x3, x5 __LF \ + adc x7, x7, xzr __LF \ + adds x12, x12, x6 __LF \ + adcs x13, x13, x7 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x7, cs __LF \ + umulh x6, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x6 __LF \ + mul x6, x3, x3 __LF \ + adcs x10, x10, x6 __LF \ + umulh x6, x3, x3 __LF \ + adcs x11, x11, x6 __LF \ + mul x6, x4, x4 __LF \ + adcs x12, x12, x6 __LF \ + umulh x6, x4, x4 __LF \ + adcs x13, x13, x6 __LF \ + mul x6, x5, x5 __LF \ + adcs x14, x14, x6 __LF \ + umulh x6, x5, x5 __LF \ + adc x7, x7, x6 __LF \ + lsl x4, x8, #32 __LF \ + lsr x5, x8, #32 __LF \ + subs x2, x4, x8 __LF \ + sbc x3, x5, xzr __LF \ + subs x9, x9, x2 __LF \ + sbcs x10, x10, x3 __LF \ + sbcs x11, x11, x4 __LF \ + sbc x8, x8, x5 __LF \ + lsl x4, x9, #32 __LF \ + lsr x5, x9, #32 __LF \ + subs x2, x4, x9 __LF \ + sbc x3, x5, xzr __LF \ + subs x10, x10, x2 __LF \ + sbcs x11, x11, x3 __LF \ + sbcs x8, x8, x4 __LF \ + sbc x9, x9, x5 __LF \ + lsl x4, x10, #32 __LF \ + lsr x5, x10, #32 __LF \ + subs x2, x4, x10 __LF \ + sbc x3, x5, xzr __LF \ + subs x11, x11, x2 __LF \ + sbcs x8, x8, x3 __LF \ + sbcs x9, x9, x4 __LF \ + sbc x10, x10, x5 __LF \ + lsl x4, x11, #32 __LF \ + lsr x5, x11, #32 __LF \ + subs x2, x4, x11 __LF \ + sbc x3, x5, xzr __LF \ + subs x8, x8, x2 __LF \ + sbcs x9, x9, x3 __LF \ + sbcs x10, x10, x4 __LF \ + sbc x11, x11, x5 __LF \ + adds x8, x8, x12 __LF \ + adcs x9, x9, x13 __LF \ + adcs x10, x10, x14 __LF \ + adcs x11, x11, x7 __LF \ + csetm x2, cs __LF \ + subs x8, x8, x2 __LF \ + and x3, x2, #0xffffffff00000000 __LF \ + sbcs x9, x9, x3 __LF \ + and x5, x2, #0xfffffffeffffffff __LF \ + sbcs x10, x10, x2 __LF \ + sbc x11, x11, x5 __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Corresponds exactly to bignum_sub_sm2 + +#define sub_sm2(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + csetm x3, cc __LF \ + adds x5, x5, x3 __LF \ + and x4, x3, #0xffffffff00000000 __LF \ + adcs x6, x6, x4 __LF \ + adcs x7, x7, x3 __LF \ + and x4, x3, #0xfffffffeffffffff __LF \ + adc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +S2N_BN_SYMBOL(sm2_montjmixadd_alt): + +// Make room on stack for temporary variables +// Move the input arguments to stable places + + sub sp, sp, NSPACE + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + 
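
[Editorial note, not part of the diff.] Before the sequence of field operations below, here is a hedged Python model of the same Jacobian mixed-addition schedule, using the temporaries' names and plain modular arithmetic. It is checked against the affine chord formula, which is a pure algebraic identity, so no curve equation is needed; the Montgomery scaling and the z_1 = 0 multiplexing handled at the end of the routine are deliberately left out.

# Editorial sketch: plain-arithmetic model of the mixed-addition schedule below.
import random

p = 2**256 - 2**224 - 2**96 + 2**64 - 1   # p_sm2

def mixadd(X1, Y1, Z1, x2, y2):
    # Same order of operations as the macro sequence, with % p in place of the
    # Montgomery multiplications and modular subtractions.
    zp2 = Z1 * Z1 % p
    y2a = Z1 * y2 % p
    x2a = zp2 * x2 % p
    y2a = zp2 * y2a % p
    xd = (x2a - X1) % p
    yd = (y2a - Y1) % p
    zz = xd * xd % p
    ww = yd * yd % p
    zzx1 = zz * X1 % p
    zzx2 = zz * x2a % p
    resx = (ww - zzx1) % p
    t1 = (zzx2 - zzx1) % p
    resz = xd * Z1 % p
    resx = (resx - zzx2) % p
    t2 = (zzx1 - resx) % p
    t1 = t1 * Y1 % p
    t2 = yd * t2 % p
    resy = (t2 - t1) % p
    return resx, resy, resz

for _ in range(100):
    x1, y1, x2, y2 = (random.randrange(1, p) for _ in range(4))
    if x1 == x2:
        continue
    Z1 = random.randrange(1, p)
    # Feed in the Jacobian triple (x1*Z1^2, y1*Z1^3, Z1) and an affine (x2, y2).
    X3, Y3, Z3 = mixadd(x1 * Z1**2 % p, y1 * Z1**3 % p, Z1, x2, y2)
    lam = (y2 - y1) * pow(x2 - x1, -1, p) % p
    ax3 = (lam * lam - x1 - x2) % p
    ay3 = (lam * (x1 - ax3) - y1) % p
    iz = pow(Z3, -1, p)
    assert X3 * iz**2 % p == ax3 and Y3 * iz**3 % p == ay3

The z_1 = 0 multiplexing at the end of the routine is what restores correctness when p1 is the point at infinity; the algebraic schedule modelled here does not cover that case.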
amontsqr_sm2(zp2,z_1) + montmul_sm2(y2a,z_1,y_2) + + montmul_sm2(x2a,zp2,x_2) + montmul_sm2(y2a,zp2,y2a) + + sub_sm2(xd,x2a,x_1) + sub_sm2(yd,y2a,y_1) + + amontsqr_sm2(zz,xd) + montsqr_sm2(ww,yd) + + montmul_sm2(zzx1,zz,x_1) + montmul_sm2(zzx2,zz,x2a) + + sub_sm2(resx,ww,zzx1) + sub_sm2(t1,zzx2,zzx1) + + montmul_sm2(resz,xd,z_1) + + sub_sm2(resx,resx,zzx2) + + sub_sm2(t2,zzx1,resx) + + montmul_sm2(t1,t1,y_1) + montmul_sm2(t2,yd,t2) + + sub_sm2(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + orr x4, x0, x1 + orr x5, x2, x3 + orr x4, x4, x5 + cmp x4, xzr + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^256 - p_sm2), +// hence giving 0 + p2 = p2 for the final result. + + ldp x0, x1, [resx] + ldp x12, x13, [x_2] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x2, x3, [resx+16] + ldp x12, x13, [x_2+16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + + ldp x4, x5, [resy] + ldp x12, x13, [y_2] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x6, x7, [resy+16] + ldp x12, x13, [y_2+16] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + + ldp x8, x9, [resz] + mov x12, #0x0000000000000001 + mov x13, #0x00000000ffffffff + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x10, x11, [resz+16] + mov x13, #0x0000000100000000 + csel x10, x10, xzr, ne + csel x11, x11, x13, ne + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [y_3] + stp x6, x7, [y_3+16] + stp x8, x9, [z_3] + stp x10, x11, [z_3+16] + +// Restore stack and return + + add sp, sp, NSPACE + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjscalarmul.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjscalarmul.S new file mode 100644 index 00000000000..b86545851de --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjscalarmul.S @@ -0,0 +1,4498 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery-Jacobian form scalar multiplication for GM/T 0003-2012 curve SM2 +// Input scalar[4], point[12]; output res[12] +// +// extern void sm2_montjscalarmul +// (uint64_t res[static 12], +// uint64_t scalar[static 4], +// uint64_t point[static 12]); +// +// This function is a variant of its affine point version sm2_scalarmul. +// Here, input and output points are assumed to be in Jacobian form with +// their coordinates in the Montgomery domain. Thus, if priming indicates +// Montgomery form, x' = (2^256 * x) mod p_sm2 etc., each point argument +// is a triple (x',y',z') representing the affine point (x/z^2,y/z^3) when +// z' is nonzero or the point at infinity (group identity) if z' = 0. +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve SM2, returns a representation of n * P. If the result is the +// point at infinity (either because the input point was or because the +// scalar was a multiple of p_sm2) then the output is guaranteed to +// represent the point at infinity, i.e. to have its z coordinate zero. 
+// +// Standard ARM ABI: X0 = res, X1 = scalar, X2 = point +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjscalarmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjscalarmul) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Safe copies of inputs (res lasts the whole code, point not so long) +// and additional values in variables, with some aliasing + +#define res x19 +#define sgn x20 +#define j x20 +#define point x21 + +// Intermediate variables on the stack. + +#define scalarb sp, #(0*NUMSIZE) +#define acc sp, #(1*NUMSIZE) +#define tabent sp, #(4*NUMSIZE) + +#define tab sp, #(7*NUMSIZE) + +#define NSPACE #(31*NUMSIZE) + +// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator, +// which doesn't accept repetitions, assembler macros etc. + +#define selectblock(I) \ + cmp x14, #(1*I) __LF \ + ldp x12, x13, [x15] __LF \ + csel x0, x12, x0, eq __LF \ + csel x1, x13, x1, eq __LF \ + ldp x12, x13, [x15, #16] __LF \ + csel x2, x12, x2, eq __LF \ + csel x3, x13, x3, eq __LF \ + ldp x12, x13, [x15, #32] __LF \ + csel x4, x12, x4, eq __LF \ + csel x5, x13, x5, eq __LF \ + ldp x12, x13, [x15, #48] __LF \ + csel x6, x12, x6, eq __LF \ + csel x7, x13, x7, eq __LF \ + ldp x12, x13, [x15, #64] __LF \ + csel x8, x12, x8, eq __LF \ + csel x9, x13, x9, eq __LF \ + ldp x12, x13, [x15, #80] __LF \ + csel x10, x12, x10, eq __LF \ + csel x11, x13, x11, eq __LF \ + add x15, x15, #96 + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(sm2_montjscalarmul): + + stp x19, x20, [sp, #-16]! + stp x21, x30, [sp, #-16]! + sub sp, sp, NSPACE + +// Preserve the "res" and "point" input arguments. We load and process the +// scalar immediately so we don't bother preserving that input argument. +// Also, "point" is only needed early on and so its register gets re-used. + + mov res, x0 + mov point, x2 + +// Load the digits of group order n_sm2 = [x12;x13;x14;x15] + + movbig(x12, #0x53bb, #0xf409, #0x39d5, #0x4123) + movbig(x13, #0x7203, #0xdf6b, #0x21c6, #0x052b) + mov x14, #0xffffffffffffffff + mov x15, #0xfffffffeffffffff + +// First, reduce the input scalar mod n_sm2, i.e. conditionally subtract n_sm2 + + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + + subs x6, x2, x12 + sbcs x7, x3, x13 + sbcs x8, x4, x14 + sbcs x9, x5, x15 + + csel x2, x2, x6, cc + csel x3, x3, x7, cc + csel x4, x4, x8, cc + csel x5, x5, x9, cc + +// Now if the top bit of the reduced scalar is set, negate it mod n_sm2, +// i.e. do n |-> n_sm2 - n. Remember the sign as "sgn" so we can +// correspondingly negate the point below. + + subs x6, x12, x2 + sbcs x7, x13, x3 + sbcs x8, x14, x4 + sbc x9, x15, x5 + + tst x5, #0x8000000000000000 + csel x2, x2, x6, eq + csel x3, x3, x7, eq + csel x4, x4, x8, eq + csel x5, x5, x9, eq + cset sgn, ne + +// In either case then add the recoding constant 0x08888...888 to allow +// signed digits. + + mov x6, 0x8888888888888888 + adds x2, x2, x6 + adcs x3, x3, x6 + bic x7, x6, #0xF000000000000000 + adcs x4, x4, x6 + adc x5, x5, x7 + + stp x2, x3, [scalarb] + stp x4, x5, [scalarb+16] + +// Set the tab[0] table entry to the input point = 1 * P, except +// that we negate it if the top bit of the scalar was set. 
This +// negation takes care over the y = 0 case to maintain all the +// coordinates < p_sm2 throughout, even though triples (x,y,z) +// with y = 0 can only represent a point on the curve when z = 0 +// and it represents the point at infinity regardless of x and y. + + ldp x0, x1, [point] + stp x0, x1, [tab] + ldp x2, x3, [point, #16] + stp x2, x3, [tab+16] + + ldp x4, x5, [point, #32] + ldp x6, x7, [point, #48] + + mov x0, #0xffffffffffffffff + subs x0, x0, x4 + mov x1, #0xffffffff00000000 + sbcs x1, x1, x5 + mov x2, #0xffffffffffffffff + sbcs x2, x2, x6 + mov x3, #0xfffffffeffffffff + sbc x3, x3, x7 + + orr x8, x4, x5 + orr x9, x6, x7 + orr x8, x8, x9 + cmp x8, xzr + ccmp sgn, xzr, #4, ne + csel x4, x0, x4, ne + csel x5, x1, x5, ne + csel x6, x2, x6, ne + csel x7, x3, x7, ne + + stp x4, x5, [tab+32] + stp x6, x7, [tab+48] + + ldp x0, x1, [point, #64] + stp x0, x1, [tab+64] + ldp x2, x3, [point, #80] + stp x2, x3, [tab+80] + +// Compute and record tab[1] = 2 * p, ..., tab[7] = 8 * P + + add x0, tab+96*1 + add x1, tab + bl sm2_montjscalarmul_sm2_montjdouble + + add x0, tab+96*2 + add x1, tab+96*1 + add x2, tab + bl sm2_montjscalarmul_sm2_montjadd + + add x0, tab+96*3 + add x1, tab+96*1 + bl sm2_montjscalarmul_sm2_montjdouble + + add x0, tab+96*4 + add x1, tab+96*3 + add x2, tab + bl sm2_montjscalarmul_sm2_montjadd + + add x0, tab+96*5 + add x1, tab+96*2 + bl sm2_montjscalarmul_sm2_montjdouble + + add x0, tab+96*6 + add x1, tab+96*5 + add x2, tab + bl sm2_montjscalarmul_sm2_montjadd + + add x0, tab+96*7 + add x1, tab+96*3 + bl sm2_montjscalarmul_sm2_montjdouble + +// Initialize the accumulator as a table entry for top 4 bits (unrecoded) + + ldr x14, [scalarb+24] + lsr x14, x14, #60 + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + add x15, tab + + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + + stp x0, x1, [acc] + stp x2, x3, [acc+16] + stp x4, x5, [acc+32] + stp x6, x7, [acc+48] + stp x8, x9, [acc+64] + stp x10, x11, [acc+80] + + mov j, #252 + +// Main loop over size-4 bitfields: double 4 times then add signed digit + +sm2_montjscalarmul_mainloop: + sub j, j, #4 + + add x0, acc + add x1, acc + bl sm2_montjscalarmul_sm2_montjdouble + + add x0, acc + add x1, acc + bl sm2_montjscalarmul_sm2_montjdouble + + add x0, acc + add x1, acc + bl sm2_montjscalarmul_sm2_montjdouble + + add x0, acc + add x1, acc + bl sm2_montjscalarmul_sm2_montjdouble + + lsr x2, j, #6 + ldr x14, [sp, x2, lsl #3] // Exploits scalarb = sp exactly + lsr x14, x14, j + and x14, x14, #15 + + subs x14, x14, #8 + cset x16, lo // x16 = sign of digit (1 = negative) + cneg x14, x14, lo // x14 = absolute value of digit + +// Conditionally select the table entry tab[i-1] = i * P in constant time + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + add x15, tab + + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + +// Store it to "tabent" with the y coordinate optionally negated +// Again, do it carefully to give coordinates < p_sm2 even in +// the degenerate case y = 0 (when z = 0 for points on the curve). 
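
[Editorial note, not part of the diff.] The careful handling described above, selecting the negated y coordinate only when it is nonzero, can be summarised in a hedged Python sketch; the helper name is illustrative only.

# Editorial sketch: negate y only when a negation is requested and y != 0, so
# the stored value stays strictly below p_sm2 (p_sm2 - 0 would be out of range).
p_sm2 = 2**256 - 2**224 - 2**96 + 2**64 - 1

def select_y(y, negate):
    # y is assumed already reduced, 0 <= y < p_sm2
    out = (p_sm2 - y) if (negate and y != 0) else y
    assert 0 <= out < p_sm2                         # never returns p_sm2 itself
    assert out == y or (out + y) % p_sm2 == 0       # out is y or its negation mod p_sm2
    return out

assert select_y(0, True) == 0   # the degenerate y = 0 case stays in range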
+ + stp x0, x1, [tabent] + stp x2, x3, [tabent+16] + + mov x0, #0xffffffffffffffff + subs x0, x0, x4 + mov x1, #0xffffffff00000000 + sbcs x1, x1, x5 + mov x2, #0xffffffffffffffff + sbcs x2, x2, x6 + mov x3, #0xfffffffeffffffff + sbc x3, x3, x7 + + orr x12, x4, x5 + orr x13, x6, x7 + orr x12, x12, x13 + cmp x12, xzr + ccmp x16, xzr, #4, ne + csel x4, x0, x4, ne + csel x5, x1, x5, ne + csel x6, x2, x6, ne + csel x7, x3, x7, ne + + stp x4, x5, [tabent+32] + stp x6, x7, [tabent+48] + stp x8, x9, [tabent+64] + stp x10, x11, [tabent+80] + + add x0, acc + add x1, acc + add x2, tabent + bl sm2_montjscalarmul_sm2_montjadd + + cbnz j, sm2_montjscalarmul_mainloop + +// That's the end of the main loop, and we just need to copy the +// result in "acc" to the output. + + ldp x0, x1, [acc] + stp x0, x1, [res] + ldp x0, x1, [acc+16] + stp x0, x1, [res, #16] + ldp x0, x1, [acc+32] + stp x0, x1, [res, #32] + ldp x0, x1, [acc+48] + stp x0, x1, [res, #48] + ldp x0, x1, [acc+64] + stp x0, x1, [res, #64] + ldp x0, x1, [acc+80] + stp x0, x1, [res, #80] + +// Restore stack and registers and return + + add sp, sp, NSPACE + ldp x21, x30, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +// Local copies of subroutines, complete clones at the moment + +sm2_montjscalarmul_sm2_montjadd: + stp x19, x20, [sp, #-0x10]! + sub sp, sp, #0xe0 + mov x17, x0 + mov x19, x1 + mov x20, x2 + ldp x2, x3, [x19, #0x40] + ldp x4, x5, [x19, #0x50] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x0, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x0, x0, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x0, x0, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + lsr x11, x15, #32 + subs x14, x12, x15 + sbc x13, x11, xzr + subs x16, x16, x14 + sbcs x0, x0, x13 + sbcs x1, x1, x12 + sbc x15, x15, x11 + lsl x12, x16, #32 + lsr x11, x16, #32 + subs x14, x12, x16 + sbc x13, x11, xzr + subs x0, x0, x14 + sbcs x1, x1, x13 + sbcs x15, x15, x12 + sbc x16, x16, x11 + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, lo + csetm x13, lo + subs x12, x5, x4 + cneg x12, x12, lo + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, lo + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x0 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + lsr x11, x6, #32 + subs x14, x12, x6 + sbc x13, x11, xzr + subs x7, x7, x14 + sbcs x8, x8, x13 + sbcs x9, x9, x12 + sbc x14, x6, x11 + adds x10, x10, x14 + adc x6, xzr, xzr + lsl x12, x7, #32 + lsr x11, x7, #32 + subs x14, x12, x7 + sbc x13, x11, xzr + subs x8, x8, x14 + sbcs x9, x9, x13 + sbcs x10, x10, x12 + sbc x14, x7, x11 + adds x6, x6, x14 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs 
x6, x6, x13 + adcs x7, x7, xzr + mov x11, #-0x100000000 + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #-0x100000001 + adcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, hs + csel x9, x11, x9, hs + csel x10, x12, x10, hs + csel x6, x13, x6, hs + stp x8, x9, [sp] + stp x10, x6, [sp, #0x10] + ldp x2, x3, [x20, #0x40] + ldp x4, x5, [x20, #0x50] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x0, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x0, x0, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x0, x0, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + lsr x11, x15, #32 + subs x14, x12, x15 + sbc x13, x11, xzr + subs x16, x16, x14 + sbcs x0, x0, x13 + sbcs x1, x1, x12 + sbc x15, x15, x11 + lsl x12, x16, #32 + lsr x11, x16, #32 + subs x14, x12, x16 + sbc x13, x11, xzr + subs x0, x0, x14 + sbcs x1, x1, x13 + sbcs x15, x15, x12 + sbc x16, x16, x11 + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, lo + csetm x13, lo + subs x12, x5, x4 + cneg x12, x12, lo + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, lo + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x0 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + lsr x11, x6, #32 + subs x14, x12, x6 + sbc x13, x11, xzr + subs x7, x7, x14 + sbcs x8, x8, x13 + sbcs x9, x9, x12 + sbc x14, x6, x11 + adds x10, x10, x14 + adc x6, xzr, xzr + lsl x12, x7, #32 + lsr x11, x7, #32 + subs x14, x12, x7 + sbc x13, x11, xzr + subs x8, x8, x14 + sbcs x9, x9, x13 + sbcs x10, x10, x12 + sbc x14, x7, x11 + adds x6, x6, x14 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #-0x100000000 + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #-0x100000001 + adcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, hs + csel x9, x11, x9, hs + csel x10, x12, x10, hs + csel x6, x13, x6, hs + stp x8, x9, [sp, #0xa0] + stp x10, x6, [sp, #0xb0] + ldp x3, x4, [x20, #0x40] + ldp x5, x6, [x20, #0x50] + ldp x7, x8, [x19, #0x20] + ldp x9, x10, [x19, #0x30] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc 
x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0xc0] + stp x11, x12, [sp, #0xd0] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x0, x10, x9 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0xc0] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0xd0] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, lo + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0xc0] + stp x3, x4, [sp, #0xd0] + ldp x3, x4, [x19, #0x40] + ldp x5, x6, [x19, #0x50] + ldp x7, x8, [x20, #0x20] + ldp x9, x10, [x20, #0x30] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, 
[sp, #0x20] + stp x11, x12, [sp, #0x30] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x0, x10, x9 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0x20] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0x30] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, lo + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0x20] + stp x3, x4, [sp, #0x30] + ldp x3, x4, [sp] + ldp x5, x6, [sp, #0x10] + ldp x7, x8, [x20] + ldp x9, x10, [x20, #0x10] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0x40] + stp x11, x12, [sp, #0x50] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs 
x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x0, x10, x9 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0x40] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0x50] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, lo + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0x40] + stp x3, x4, [sp, #0x50] + ldp x3, x4, [sp, #0xa0] + ldp x5, x6, [sp, #0xb0] + ldp x7, x8, [x19] + ldp x9, x10, [x19, #0x10] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0x80] + stp x11, x12, [sp, #0x90] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x0, x10, x9 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 
+ eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0x80] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0x90] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, lo + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0x80] + stp x3, x4, [sp, #0x90] + ldp x3, x4, [sp] + ldp x5, x6, [sp, #0x10] + ldp x7, x8, [sp, #0x20] + ldp x9, x10, [sp, #0x30] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0x20] + stp x11, x12, [sp, #0x30] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x0, x10, x9 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, 
x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0x20] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0x30] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, lo + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0x20] + stp x3, x4, [sp, #0x30] + ldp x3, x4, [sp, #0xa0] + ldp x5, x6, [sp, #0xb0] + ldp x7, x8, [sp, #0xc0] + ldp x9, x10, [sp, #0xd0] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0xc0] + stp x11, x12, [sp, #0xd0] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x0, x10, x9 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, 
x1, [sp, #0xc0] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0xd0] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, lo + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0xc0] + stp x3, x4, [sp, #0xd0] + ldp x5, x6, [sp, #0x40] + ldp x4, x3, [sp, #0x80] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x50] + ldp x4, x3, [sp, #0x90] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, #0xa0] + stp x7, x8, [sp, #0xb0] + ldp x5, x6, [sp, #0x20] + ldp x4, x3, [sp, #0xc0] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x30] + ldp x4, x3, [sp, #0xd0] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, #0x20] + stp x7, x8, [sp, #0x30] + ldp x2, x3, [sp, #0xa0] + ldp x4, x5, [sp, #0xb0] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x0, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x0, x0, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x0, x0, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + lsr x11, x15, #32 + subs x14, x12, x15 + sbc x13, x11, xzr + subs x16, x16, x14 + sbcs x0, x0, x13 + sbcs x1, x1, x12 + sbc x15, x15, x11 + lsl x12, x16, #32 + lsr x11, x16, #32 + subs x14, x12, x16 + sbc x13, x11, xzr + subs x0, x0, x14 + sbcs x1, x1, x13 + sbcs x15, x15, x12 + sbc x16, x16, x11 + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, lo + csetm x13, lo + subs x12, x5, x4 + cneg x12, x12, lo + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, lo + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + 
adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x0 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + lsr x11, x6, #32 + subs x14, x12, x6 + sbc x13, x11, xzr + subs x7, x7, x14 + sbcs x8, x8, x13 + sbcs x9, x9, x12 + sbc x14, x6, x11 + adds x10, x10, x14 + adc x6, xzr, xzr + lsl x12, x7, #32 + lsr x11, x7, #32 + subs x14, x12, x7 + sbc x13, x11, xzr + subs x8, x8, x14 + sbcs x9, x9, x13 + sbcs x10, x10, x12 + sbc x14, x7, x11 + adds x6, x6, x14 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #-0x100000000 + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #-0x100000001 + adcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, hs + csel x9, x11, x9, hs + csel x10, x12, x10, hs + csel x6, x13, x6, hs + stp x8, x9, [sp, #0x60] + stp x10, x6, [sp, #0x70] + ldp x2, x3, [sp, #0x20] + ldp x4, x5, [sp, #0x30] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x0, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x0, x0, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x0, x0, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + lsr x11, x15, #32 + subs x14, x12, x15 + sbc x13, x11, xzr + subs x16, x16, x14 + sbcs x0, x0, x13 + sbcs x1, x1, x12 + sbc x15, x15, x11 + lsl x12, x16, #32 + lsr x11, x16, #32 + subs x14, x12, x16 + sbc x13, x11, xzr + subs x0, x0, x14 + sbcs x1, x1, x13 + sbcs x15, x15, x12 + sbc x16, x16, x11 + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, lo + csetm x13, lo + subs x12, x5, x4 + cneg x12, x12, lo + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, lo + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x0 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + lsr x11, x6, #32 + subs x14, x12, x6 + sbc x13, x11, xzr + subs x7, x7, x14 + sbcs x8, x8, x13 + sbcs x9, x9, x12 + sbc x14, x6, x11 + adds x10, x10, x14 + adc x6, xzr, xzr + lsl x12, x7, #32 + lsr x11, x7, #32 + subs x14, x12, x7 + sbc x13, x11, xzr + subs x8, x8, x14 + sbcs x9, x9, x13 + sbcs x10, x10, x12 + sbc x14, x7, x11 + adds x6, x6, x14 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, 
x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #-0x100000000 + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #-0x100000001 + adcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, hs + csel x9, x11, x9, hs + csel x10, x12, x10, hs + csel x6, x13, x6, hs + stp x8, x9, [sp] + stp x10, x6, [sp, #0x10] + ldp x3, x4, [sp, #0x60] + ldp x5, x6, [sp, #0x70] + ldp x7, x8, [sp, #0x80] + ldp x9, x10, [sp, #0x90] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0x80] + stp x11, x12, [sp, #0x90] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x0, x10, x9 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0x80] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0x90] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, lo + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + 
adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0x80] + stp x3, x4, [sp, #0x90] + ldp x3, x4, [sp, #0x60] + ldp x5, x6, [sp, #0x70] + ldp x7, x8, [sp, #0x40] + ldp x9, x10, [sp, #0x50] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0x40] + stp x11, x12, [sp, #0x50] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x0, x10, x9 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0x40] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0x50] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, lo + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0x40] + stp x3, x4, [sp, #0x50] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #0x80] + subs 
x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x10] + ldp x4, x3, [sp, #0x90] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #0x10] + ldp x5, x6, [sp, #0x40] + ldp x4, x3, [sp, #0x80] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x50] + ldp x4, x3, [sp, #0x90] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, #0x60] + stp x7, x8, [sp, #0x70] + ldp x3, x4, [sp, #0xa0] + ldp x5, x6, [sp, #0xb0] + ldp x7, x8, [x19, #0x40] + ldp x9, x10, [x19, #0x50] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0xa0] + stp x11, x12, [sp, #0xb0] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x0, x10, x9 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0xa0] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0xb0] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, lo + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc 
x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0xa0] + stp x3, x4, [sp, #0xb0] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #0x40] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x10] + ldp x4, x3, [sp, #0x50] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #0x10] + ldp x5, x6, [sp, #0x80] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x90] + ldp x4, x3, [sp, #0x10] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, #0x80] + stp x7, x8, [sp, #0x90] + ldp x3, x4, [sp, #0x60] + ldp x5, x6, [sp, #0x70] + ldp x7, x8, [sp, #0xc0] + ldp x9, x10, [sp, #0xd0] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0x60] + stp x11, x12, [sp, #0x70] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x0, x10, x9 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0x60] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0x70] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, lo + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, 
#0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0x60] + stp x3, x4, [sp, #0x70] + ldp x3, x4, [sp, #0xa0] + ldp x5, x6, [sp, #0xb0] + ldp x7, x8, [x20, #0x40] + ldp x9, x10, [x20, #0x50] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0xa0] + stp x11, x12, [sp, #0xb0] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x0, x10, x9 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0xa0] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0xb0] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, lo + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs 
x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0xa0] + stp x3, x4, [sp, #0xb0] + ldp x3, x4, [sp, #0x20] + ldp x5, x6, [sp, #0x30] + ldp x7, x8, [sp, #0x80] + ldp x9, x10, [sp, #0x90] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0x80] + stp x11, x12, [sp, #0x90] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x0, x10, x9 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0x80] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0x90] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, lo + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc 
x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0x80] + stp x3, x4, [sp, #0x90] + ldp x5, x6, [sp, #0x80] + ldp x4, x3, [sp, #0x60] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x90] + ldp x4, x3, [sp, #0x70] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, #0x80] + stp x7, x8, [sp, #0x90] + ldp x0, x1, [x19, #0x40] + ldp x2, x3, [x19, #0x50] + orr x12, x0, x1 + orr x13, x2, x3 + orr x12, x12, x13 + cmp x12, xzr + cset x12, ne + ldp x4, x5, [x20, #0x40] + ldp x6, x7, [x20, #0x50] + orr x13, x4, x5 + orr x14, x6, x7 + orr x13, x13, x14 + cmp x13, xzr + cset x13, ne + cmp x13, x12 + ldp x8, x9, [sp, #0xa0] + csel x8, x0, x8, lo + csel x9, x1, x9, lo + csel x8, x4, x8, hi + csel x9, x5, x9, hi + ldp x10, x11, [sp, #0xb0] + csel x10, x2, x10, lo + csel x11, x3, x11, lo + csel x10, x6, x10, hi + csel x11, x7, x11, hi + ldp x12, x13, [x19] + ldp x0, x1, [sp] + csel x0, x12, x0, lo + csel x1, x13, x1, lo + ldp x12, x13, [x20] + csel x0, x12, x0, hi + csel x1, x13, x1, hi + ldp x12, x13, [x19, #0x10] + ldp x2, x3, [sp, #0x10] + csel x2, x12, x2, lo + csel x3, x13, x3, lo + ldp x12, x13, [x20, #0x10] + csel x2, x12, x2, hi + csel x3, x13, x3, hi + ldp x12, x13, [x19, #0x20] + ldp x4, x5, [sp, #0x80] + csel x4, x12, x4, lo + csel x5, x13, x5, lo + ldp x12, x13, [x20, #0x20] + csel x4, x12, x4, hi + csel x5, x13, x5, hi + ldp x12, x13, [x19, #0x30] + ldp x6, x7, [sp, #0x90] + csel x6, x12, x6, lo + csel x7, x13, x7, lo + ldp x12, x13, [x20, #0x30] + csel x6, x12, x6, hi + csel x7, x13, x7, hi + stp x0, x1, [x17] + stp x2, x3, [x17, #0x10] + stp x4, x5, [x17, #0x20] + stp x6, x7, [x17, #0x30] + stp x8, x9, [x17, #0x40] + stp x10, x11, [x17, #0x50] + add sp, sp, #0xe0 + ldp x19, x20, [sp], #0x10 + ret + +sm2_montjscalarmul_sm2_montjdouble: + sub sp, sp, #0xd0 + stp x19, x20, [sp, #0xc0] + mov x19, x0 + mov x20, x1 + ldp x2, x3, [x20, #0x40] + ldp x4, x5, [x20, #0x50] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x17, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x17, x17, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x17, x17, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + lsr x11, x15, #32 + subs x14, x12, x15 + sbc x13, x11, xzr + subs x16, x16, x14 + sbcs x17, x17, x13 + sbcs x1, x1, x12 + sbc x15, x15, x11 + lsl x12, x16, #32 + lsr x11, x16, #32 + subs x14, x12, x16 + sbc x13, x11, xzr + subs x17, x17, x14 + sbcs x1, x1, x13 + sbcs x15, x15, x12 + sbc x16, x16, x11 + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, lo + csetm x13, lo + subs x12, x5, x4 
+ cneg x12, x12, lo + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, lo + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x17 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + lsr x11, x6, #32 + subs x14, x12, x6 + sbc x13, x11, xzr + subs x7, x7, x14 + sbcs x8, x8, x13 + sbcs x9, x9, x12 + sbc x14, x6, x11 + adds x10, x10, x14 + adc x6, xzr, xzr + lsl x12, x7, #32 + lsr x11, x7, #32 + subs x14, x12, x7 + sbc x13, x11, xzr + subs x8, x8, x14 + sbcs x9, x9, x13 + sbcs x10, x10, x12 + sbc x14, x7, x11 + adds x6, x6, x14 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #-0x100000000 + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #-0x100000001 + adcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, hs + csel x9, x11, x9, hs + csel x10, x12, x10, hs + csel x6, x13, x6, hs + stp x8, x9, [sp] + stp x10, x6, [sp, #0x10] + ldp x2, x3, [x20, #0x20] + ldp x4, x5, [x20, #0x30] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x17, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x17, x17, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x17, x17, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + lsr x11, x15, #32 + subs x14, x12, x15 + sbc x13, x11, xzr + subs x16, x16, x14 + sbcs x17, x17, x13 + sbcs x1, x1, x12 + sbc x15, x15, x11 + lsl x12, x16, #32 + lsr x11, x16, #32 + subs x14, x12, x16 + sbc x13, x11, xzr + subs x17, x17, x14 + sbcs x1, x1, x13 + sbcs x15, x15, x12 + sbc x16, x16, x11 + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, lo + csetm x13, lo + subs x12, x5, x4 + cneg x12, x12, lo + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, lo + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x17 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + lsr x11, x6, #32 + subs x14, x12, x6 + sbc x13, x11, xzr + subs x7, x7, x14 + sbcs x8, x8, x13 + sbcs x9, x9, x12 + sbc x14, x6, x11 + adds x10, x10, x14 + adc x6, xzr, xzr + lsl x12, x7, #32 + lsr x11, x7, #32 + subs x14, x12, x7 + sbc x13, x11, xzr + subs x8, x8, x14 + sbcs x9, x9, x13 + sbcs x10, x10, x12 + sbc x14, x7, x11 + adds x6, x6, x14 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + 
umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #-0x100000000 + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #-0x100000001 + adcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, hs + csel x9, x11, x9, hs + csel x10, x12, x10, hs + csel x6, x13, x6, hs + stp x8, x9, [sp, #0x20] + stp x10, x6, [sp, #0x30] + ldp x5, x6, [x20] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x20, #0x10] + ldp x4, x3, [sp, #0x10] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, #0x60] + stp x7, x8, [sp, #0x70] + ldp x4, x5, [x20] + ldp x8, x9, [sp] + adds x4, x4, x8 + adcs x5, x5, x9 + ldp x6, x7, [x20, #0x10] + ldp x10, x11, [sp, #0x10] + adcs x6, x6, x10 + adcs x7, x7, x11 + csetm x2, hs + subs x4, x4, x2 + and x3, x2, #0xffffffff00000000 + sbcs x5, x5, x3 + and x1, x2, #0xfffffffeffffffff + sbcs x6, x6, x2 + sbc x7, x7, x1 + stp x4, x5, [sp, #0x40] + stp x6, x7, [sp, #0x50] + ldp x3, x4, [sp, #0x40] + ldp x5, x6, [sp, #0x50] + ldp x7, x8, [sp, #0x60] + ldp x9, x10, [sp, #0x70] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x17, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x17, x8, x7 + cneg x17, x17, lo + mul x16, x15, x17 + umulh x17, x15, x17 + cinv x1, x1, lo + eor x16, x16, x1 + eor x17, x17, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x17 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x17, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x17 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x17, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x17 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0x60] + stp x11, x12, [sp, #0x70] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x17, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x17, x10, x9 + cneg x17, x17, lo + mul x16, x15, x17 + umulh x17, x15, x17 + cinv x1, x1, lo + eor x16, x16, x1 + eor x17, x17, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x17 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0x60] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0x70] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x17, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x17, x8, x7 + cneg x17, x17, lo + mul x16, x3, x17 + umulh x17, x3, x17 + cinv x4, x4, lo + eor x16, x16, x4 + eor x17, x17, x4 + cmn x4, #0x1 + adcs 
x12, x12, x16 + adcs x13, x13, x17 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x17, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x17 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x17, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x17 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0x60] + stp x3, x4, [sp, #0x70] + ldp x4, x5, [x20, #0x20] + ldp x8, x9, [x20, #0x40] + adds x4, x4, x8 + adcs x5, x5, x9 + ldp x6, x7, [x20, #0x30] + ldp x10, x11, [x20, #0x50] + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x3, xzr, xzr + adds x8, x4, #0x1 + mov x9, #-0x100000000 + sbcs x9, x5, x9 + adcs x10, x6, xzr + mov x11, #-0x100000001 + sbcs x11, x7, x11 + sbcs x3, x3, xzr + csel x4, x4, x8, lo + csel x5, x5, x9, lo + csel x6, x6, x10, lo + csel x7, x7, x11, lo + stp x4, x5, [sp, #0x40] + stp x6, x7, [sp, #0x50] + ldp x3, x4, [x20] + ldp x5, x6, [x20, #0x10] + ldp x7, x8, [sp, #0x20] + ldp x9, x10, [sp, #0x30] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x17, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x17, x8, x7 + cneg x17, x17, lo + mul x16, x15, x17 + umulh x17, x15, x17 + cinv x1, x1, lo + eor x16, x16, x1 + eor x17, x17, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x17 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x17, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x17 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x17, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x17 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0x80] + stp x11, x12, [sp, #0x90] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x17, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x17, x10, x9 + cneg x17, x17, lo + mul x16, x15, x17 + umulh x17, x15, x17 + cinv x1, x1, lo + eor x16, x16, x1 + eor x17, x17, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x17 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0x80] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0x90] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + 
umulh x14, x4, x8 + adcs x17, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x17, x8, x7 + cneg x17, x17, lo + mul x16, x3, x17 + umulh x17, x3, x17 + cinv x4, x4, lo + eor x16, x16, x4 + eor x17, x17, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x17 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x17, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x17 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x17, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x17 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0x80] + stp x3, x4, [sp, #0x90] + ldp x2, x3, [sp, #0x60] + ldp x4, x5, [sp, #0x70] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x17, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x17, x17, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x17, x17, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + lsr x11, x15, #32 + subs x14, x12, x15 + sbc x13, x11, xzr + subs x16, x16, x14 + sbcs x17, x17, x13 + sbcs x1, x1, x12 + sbc x15, x15, x11 + lsl x12, x16, #32 + lsr x11, x16, #32 + subs x14, x12, x16 + sbc x13, x11, xzr + subs x17, x17, x14 + sbcs x1, x1, x13 + sbcs x15, x15, x12 + sbc x16, x16, x11 + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, lo + csetm x13, lo + subs x12, x5, x4 + cneg x12, x12, lo + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, lo + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x17 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + lsr x11, x6, #32 + subs x14, x12, x6 + sbc x13, x11, xzr + subs x7, x7, x14 + sbcs x8, x8, x13 + sbcs x9, x9, x12 + sbc x14, x6, x11 + adds x10, x10, x14 + adc x6, xzr, xzr + lsl x12, x7, #32 + lsr x11, x7, #32 + subs x14, x12, x7 + sbc x13, x11, xzr + subs x8, x8, x14 + sbcs x9, x9, x13 + sbcs x10, x10, x12 + sbc x14, x7, x11 + adds x6, x6, x14 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, 
x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #-0x100000000 + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #-0x100000001 + adcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, hs + csel x9, x11, x9, hs + csel x10, x12, x10, hs + csel x6, x13, x6, hs + stp x8, x9, [sp, #0xa0] + stp x10, x6, [sp, #0xb0] + ldp x2, x3, [sp, #0x40] + ldp x4, x5, [sp, #0x50] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x17, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x17, x17, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x17, x17, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + lsr x11, x15, #32 + subs x14, x12, x15 + sbc x13, x11, xzr + subs x16, x16, x14 + sbcs x17, x17, x13 + sbcs x1, x1, x12 + sbc x15, x15, x11 + lsl x12, x16, #32 + lsr x11, x16, #32 + subs x14, x12, x16 + sbc x13, x11, xzr + subs x17, x17, x14 + sbcs x1, x1, x13 + sbcs x15, x15, x12 + sbc x16, x16, x11 + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, lo + csetm x13, lo + subs x12, x5, x4 + cneg x12, x12, lo + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, lo + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x17 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + lsr x11, x6, #32 + subs x14, x12, x6 + sbc x13, x11, xzr + subs x7, x7, x14 + sbcs x8, x8, x13 + sbcs x9, x9, x12 + sbc x14, x6, x11 + adds x10, x10, x14 + adc x6, xzr, xzr + lsl x12, x7, #32 + lsr x11, x7, #32 + subs x14, x12, x7 + sbc x13, x11, xzr + subs x8, x8, x14 + sbcs x9, x9, x13 + sbcs x10, x10, x12 + sbc x14, x7, x11 + adds x6, x6, x14 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #-0x100000000 + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #-0x100000001 + adcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, hs + csel x9, x11, x9, hs + csel x10, x12, x10, hs + csel x6, x13, x6, hs + stp x8, x9, [sp, #0x40] + stp x10, x6, [sp, #0x50] + mov x1, #0x9 + mov x2, #-0x1 + ldp x9, x10, [sp, #0xa0] + subs x9, x2, x9 + mov x3, #-0x100000000 + sbcs x10, x3, x10 + ldp x11, x12, [sp, #0xb0] + sbcs x11, x2, x11 + mov x4, #-0x100000001 + sbc x12, x4, x12 + mul x3, x1, x9 + mul x4, x1, x10 + mul x5, x1, x11 + mul x6, x1, x12 + umulh x9, x1, x9 + umulh x10, x1, x10 + umulh x11, x1, x11 + umulh x7, x1, x12 + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, xzr + mov x1, #0xc + ldp x9, x10, [sp, #0x80] + mul x8, x9, x1 + umulh x9, x9, x1 + adds x3, x3, x8 + mul x8, x10, x1 + umulh x10, x10, x1 + adcs x4, x4, x8 + ldp x11, 
x12, [sp, #0x90] + mul x8, x11, x1 + umulh x11, x11, x1 + adcs x5, x5, x8 + mul x8, x12, x1 + umulh x12, x12, x1 + adcs x6, x6, x8 + adc x7, x7, xzr + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, x12 + add x7, x7, #0x1 + lsl x8, x7, #32 + sub x9, x8, x7 + adds x3, x3, x7 + adcs x4, x4, x9 + adcs x5, x5, xzr + adcs x6, x6, x8 + csetm x7, lo + adds x3, x3, x7 + and x9, x7, #0xffffffff00000000 + adcs x4, x4, x9 + adcs x5, x5, x7 + and x8, x7, #0xfffffffeffffffff + adc x6, x6, x8 + stp x3, x4, [sp, #0xa0] + stp x5, x6, [sp, #0xb0] + ldp x5, x6, [sp, #0x40] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x50] + ldp x4, x3, [sp, #0x10] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, #0x40] + stp x7, x8, [sp, #0x50] + ldp x2, x3, [sp, #0x20] + ldp x4, x5, [sp, #0x30] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x17, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x17, x17, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x17, x17, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + lsr x11, x15, #32 + subs x14, x12, x15 + sbc x13, x11, xzr + subs x16, x16, x14 + sbcs x17, x17, x13 + sbcs x1, x1, x12 + sbc x15, x15, x11 + lsl x12, x16, #32 + lsr x11, x16, #32 + subs x14, x12, x16 + sbc x13, x11, xzr + subs x17, x17, x14 + sbcs x1, x1, x13 + sbcs x15, x15, x12 + sbc x16, x16, x11 + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, lo + csetm x13, lo + subs x12, x5, x4 + cneg x12, x12, lo + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, lo + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x17 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + lsr x11, x6, #32 + subs x14, x12, x6 + sbc x13, x11, xzr + subs x7, x7, x14 + sbcs x8, x8, x13 + sbcs x9, x9, x12 + sbc x14, x6, x11 + adds x10, x10, x14 + adc x6, xzr, xzr + lsl x12, x7, #32 + lsr x11, x7, #32 + subs x14, x12, x7 + sbc x13, x11, xzr + subs x8, x8, x14 + sbcs x9, x9, x13 + sbcs x10, x10, x12 + sbc x14, x7, x11 + adds x6, x6, x14 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #-0x100000000 + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #-0x100000001 + adcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, hs + csel x9, x11, x9, hs + csel x10, x12, x10, hs + csel x6, x13, x6, hs + stp x8, x9, [sp] + stp x10, x6, [sp, #0x10] + ldp x3, x4, [sp, #0xa0] + ldp x5, x6, [sp, #0xb0] + ldp x7, x8, [sp, #0x60] + ldp x9, x10, [sp, #0x70] 
+ mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x17, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x17, x8, x7 + cneg x17, x17, lo + mul x16, x15, x17 + umulh x17, x15, x17 + cinv x1, x1, lo + eor x16, x16, x1 + eor x17, x17, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x17 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x17, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x17 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x17, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x17 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0x60] + stp x11, x12, [sp, #0x70] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x17, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x17, x10, x9 + cneg x17, x17, lo + mul x16, x15, x17 + umulh x17, x15, x17 + cinv x1, x1, lo + eor x16, x16, x1 + eor x17, x17, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x17 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0x60] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0x70] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x17, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x17, x8, x7 + cneg x17, x17, lo + mul x16, x3, x17 + umulh x17, x3, x17 + cinv x4, x4, lo + eor x16, x16, x4 + eor x17, x17, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x17 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x17, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x17 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x17, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x17 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0x60] + stp x3, x4, [sp, #0x70] + ldp x5, x6, [sp, #0x40] + ldp x4, x3, [sp, #0x20] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x50] + ldp x4, x3, [sp, #0x30] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, 
x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [x19, #0x40] + stp x7, x8, [x19, #0x50] + ldp x1, x2, [sp, #0x80] + lsl x0, x1, #2 + ldp x6, x7, [sp, #0xa0] + subs x0, x0, x6 + extr x1, x2, x1, #0x3e + sbcs x1, x1, x7 + ldp x3, x4, [sp, #0x90] + extr x2, x3, x2, #0x3e + ldp x6, x7, [sp, #0xb0] + sbcs x2, x2, x6 + extr x3, x4, x3, #0x3e + sbcs x3, x3, x7 + lsr x4, x4, #62 + sbc x4, x4, xzr + add x4, x4, #0x1 + lsl x5, x4, #32 + sub x6, x5, x4 + adds x0, x0, x4 + adcs x1, x1, x6 + adcs x2, x2, xzr + adcs x3, x3, x5 + csetm x4, lo + adds x0, x0, x4 + and x6, x4, #0xffffffff00000000 + adcs x1, x1, x6 + adcs x2, x2, x4 + and x5, x4, #0xfffffffeffffffff + adc x3, x3, x5 + stp x0, x1, [x19] + stp x2, x3, [x19, #0x10] + mov x1, #0x8 + mov x2, #-0x1 + ldp x9, x10, [sp] + subs x9, x2, x9 + mov x3, #-0x100000000 + sbcs x10, x3, x10 + ldp x11, x12, [sp, #0x10] + sbcs x11, x2, x11 + mov x4, #-0x100000001 + sbc x12, x4, x12 + lsl x3, x9, #3 + extr x4, x10, x9, #0x3d + extr x5, x11, x10, #0x3d + extr x6, x12, x11, #0x3d + lsr x7, x12, #61 + mov x1, #0x3 + ldp x9, x10, [sp, #0x60] + mul x8, x9, x1 + umulh x9, x9, x1 + adds x3, x3, x8 + mul x8, x10, x1 + umulh x10, x10, x1 + adcs x4, x4, x8 + ldp x11, x12, [sp, #0x70] + mul x8, x11, x1 + umulh x11, x11, x1 + adcs x5, x5, x8 + mul x8, x12, x1 + umulh x12, x12, x1 + adcs x6, x6, x8 + adc x7, x7, xzr + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, x12 + add x7, x7, #0x1 + lsl x8, x7, #32 + sub x9, x8, x7 + adds x3, x3, x7 + adcs x4, x4, x9 + adcs x5, x5, xzr + adcs x6, x6, x8 + csetm x7, lo + adds x3, x3, x7 + and x9, x7, #0xffffffff00000000 + adcs x4, x4, x9 + adcs x5, x5, x7 + and x8, x7, #0xfffffffeffffffff + adc x6, x6, x8 + stp x3, x4, [x19, #0x20] + stp x5, x6, [x19, #0x30] + ldp x19, x20, [sp, #0xc0] + add sp, sp, #0xd0 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjscalarmul_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjscalarmul_alt.S new file mode 100644 index 00000000000..01682044936 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjscalarmul_alt.S @@ -0,0 +1,3405 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery-Jacobian form scalar multiplication for GM/T 0003-2012 curve SM2 +// Input scalar[4], point[12]; output res[12] +// +// extern void sm2_montjscalarmul_alt +// (uint64_t res[static 12], +// uint64_t scalar[static 4], +// uint64_t point[static 12]); +// +// This function is a variant of its affine point version sm2_scalarmul_alt. +// Here, input and output points are assumed to be in Jacobian form with +// their coordinates in the Montgomery domain. Thus, if priming indicates +// Montgomery form, x' = (2^256 * x) mod p_sm2 etc., each point argument +// is a triple (x',y',z') representing the affine point (x/z^2,y/z^3) when +// z' is nonzero or the point at infinity (group identity) if z' = 0. +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve SM2, returns a representation of n * P. If the result is the +// point at infinity (either because the input point was or because the +// scalar was a multiple of p_sm2) then the output is guaranteed to +// represent the point at infinity, i.e. 
to have its z coordinate zero. +// +// Standard ARM ABI: X0 = res, X1 = scalar, X2 = point +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjscalarmul_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjscalarmul_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Safe copies of inputs (res lasts the whole code, point not so long) +// and additional values in variables, with some aliasing + +#define res x19 +#define sgn x20 +#define j x20 +#define point x21 + +// Intermediate variables on the stack. + +#define scalarb sp, #(0*NUMSIZE) +#define acc sp, #(1*NUMSIZE) +#define tabent sp, #(4*NUMSIZE) + +#define tab sp, #(7*NUMSIZE) + +#define NSPACE #(31*NUMSIZE) + +// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator, +// which doesn't accept repetitions, assembler macros etc. + +#define selectblock(I) \ + cmp x14, #(1*I) __LF \ + ldp x12, x13, [x15] __LF \ + csel x0, x12, x0, eq __LF \ + csel x1, x13, x1, eq __LF \ + ldp x12, x13, [x15, #16] __LF \ + csel x2, x12, x2, eq __LF \ + csel x3, x13, x3, eq __LF \ + ldp x12, x13, [x15, #32] __LF \ + csel x4, x12, x4, eq __LF \ + csel x5, x13, x5, eq __LF \ + ldp x12, x13, [x15, #48] __LF \ + csel x6, x12, x6, eq __LF \ + csel x7, x13, x7, eq __LF \ + ldp x12, x13, [x15, #64] __LF \ + csel x8, x12, x8, eq __LF \ + csel x9, x13, x9, eq __LF \ + ldp x12, x13, [x15, #80] __LF \ + csel x10, x12, x10, eq __LF \ + csel x11, x13, x11, eq __LF \ + add x15, x15, #96 + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(sm2_montjscalarmul_alt): + + stp x19, x20, [sp, #-16]! + stp x21, x30, [sp, #-16]! + sub sp, sp, NSPACE + +// Preserve the "res" and "point" input arguments. We load and process the +// scalar immediately so we don't bother preserving that input argument. +// Also, "point" is only needed early on and so its register gets re-used. + + mov res, x0 + mov point, x2 + +// Load the digits of group order n_sm2 = [x12;x13;x14;x15] + + movbig(x12, #0x53bb, #0xf409, #0x39d5, #0x4123) + movbig(x13, #0x7203, #0xdf6b, #0x21c6, #0x052b) + mov x14, #0xffffffffffffffff + mov x15, #0xfffffffeffffffff + +// First, reduce the input scalar mod n_sm2, i.e. conditionally subtract n_sm2 + + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + + subs x6, x2, x12 + sbcs x7, x3, x13 + sbcs x8, x4, x14 + sbcs x9, x5, x15 + + csel x2, x2, x6, cc + csel x3, x3, x7, cc + csel x4, x4, x8, cc + csel x5, x5, x9, cc + +// Now if the top bit of the reduced scalar is set, negate it mod n_sm2, +// i.e. do n |-> n_sm2 - n. Remember the sign as "sgn" so we can +// correspondingly negate the point below. + + subs x6, x12, x2 + sbcs x7, x13, x3 + sbcs x8, x14, x4 + sbc x9, x15, x5 + + tst x5, #0x8000000000000000 + csel x2, x2, x6, eq + csel x3, x3, x7, eq + csel x4, x4, x8, eq + csel x5, x5, x9, eq + cset sgn, ne + +// In either case then add the recoding constant 0x08888...888 to allow +// signed digits. + + mov x6, 0x8888888888888888 + adds x2, x2, x6 + adcs x3, x3, x6 + bic x7, x6, #0xF000000000000000 + adcs x4, x4, x6 + adc x5, x5, x7 + + stp x2, x3, [scalarb] + stp x4, x5, [scalarb+16] + +// Set the tab[0] table entry to the input point = 1 * P, except +// that we negate it if the top bit of the scalar was set. 
This +// negation takes care over the y = 0 case to maintain all the +// coordinates < p_sm2 throughout, even though triples (x,y,z) +// with y = 0 can only represent a point on the curve when z = 0 +// and it represents the point at infinity regardless of x and y. + + ldp x0, x1, [point] + stp x0, x1, [tab] + ldp x2, x3, [point, #16] + stp x2, x3, [tab+16] + + ldp x4, x5, [point, #32] + ldp x6, x7, [point, #48] + + mov x0, #0xffffffffffffffff + subs x0, x0, x4 + mov x1, #0xffffffff00000000 + sbcs x1, x1, x5 + mov x2, #0xffffffffffffffff + sbcs x2, x2, x6 + mov x3, #0xfffffffeffffffff + sbc x3, x3, x7 + + orr x8, x4, x5 + orr x9, x6, x7 + orr x8, x8, x9 + cmp x8, xzr + ccmp sgn, xzr, #4, ne + csel x4, x0, x4, ne + csel x5, x1, x5, ne + csel x6, x2, x6, ne + csel x7, x3, x7, ne + + stp x4, x5, [tab+32] + stp x6, x7, [tab+48] + + ldp x0, x1, [point, #64] + stp x0, x1, [tab+64] + ldp x2, x3, [point, #80] + stp x2, x3, [tab+80] + +// Compute and record tab[1] = 2 * p, ..., tab[7] = 8 * P + + add x0, tab+96*1 + add x1, tab + bl sm2_montjscalarmul_alt_sm2_montjdouble + + add x0, tab+96*2 + add x1, tab+96*1 + add x2, tab + bl sm2_montjscalarmul_alt_sm2_montjadd + + add x0, tab+96*3 + add x1, tab+96*1 + bl sm2_montjscalarmul_alt_sm2_montjdouble + + add x0, tab+96*4 + add x1, tab+96*3 + add x2, tab + bl sm2_montjscalarmul_alt_sm2_montjadd + + add x0, tab+96*5 + add x1, tab+96*2 + bl sm2_montjscalarmul_alt_sm2_montjdouble + + add x0, tab+96*6 + add x1, tab+96*5 + add x2, tab + bl sm2_montjscalarmul_alt_sm2_montjadd + + add x0, tab+96*7 + add x1, tab+96*3 + bl sm2_montjscalarmul_alt_sm2_montjdouble + +// Initialize the accumulator as a table entry for top 4 bits (unrecoded) + + ldr x14, [scalarb+24] + lsr x14, x14, #60 + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + add x15, tab + + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + + stp x0, x1, [acc] + stp x2, x3, [acc+16] + stp x4, x5, [acc+32] + stp x6, x7, [acc+48] + stp x8, x9, [acc+64] + stp x10, x11, [acc+80] + + mov j, #252 + +// Main loop over size-4 bitfields: double 4 times then add signed digit + +sm2_montjscalarmul_alt_mainloop: + sub j, j, #4 + + add x0, acc + add x1, acc + bl sm2_montjscalarmul_alt_sm2_montjdouble + + add x0, acc + add x1, acc + bl sm2_montjscalarmul_alt_sm2_montjdouble + + add x0, acc + add x1, acc + bl sm2_montjscalarmul_alt_sm2_montjdouble + + add x0, acc + add x1, acc + bl sm2_montjscalarmul_alt_sm2_montjdouble + + lsr x2, j, #6 + ldr x14, [sp, x2, lsl #3] // Exploits scalarb = sp exactly + lsr x14, x14, j + and x14, x14, #15 + + subs x14, x14, #8 + cset x16, lo // x16 = sign of digit (1 = negative) + cneg x14, x14, lo // x14 = absolute value of digit + +// Conditionally select the table entry tab[i-1] = i * P in constant time + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + add x15, tab + + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + +// Store it to "tabent" with the y coordinate optionally negated +// Again, do it carefully to give coordinates < p_sm2 even in +// the degenerate case y = 0 (when z = 0 for points on the curve). 
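To summarize the loop structure spelled out in the comments above: the reduced scalar (top bit already cleared) is biased by the constant 0x0888...888 so that each 4-bit window minus 8 becomes a signed digit in [-8, 7]; the table holds 1*P through 8*P, the digit's absolute value picks a row through the constant-time selectblock scans, and a negative digit negates the selected y coordinate mod p_sm2, with y = 0 left untouched so the point-at-infinity encoding survives. A short Python model of the recoding and the conditional negation; the constants are the ones loaded above, everything else is illustrative:

    P_SM2 = 2**256 - 2**224 - 2**96 + 2**64 - 1
    BIAS = int("8" * 63, 16)          # the 0x0888...888 recoding constant

    def recode(k):
        # k is the scalar after reduction mod n_sm2 and conditional
        # negation, so k < 2^255.
        v = k + BIAS
        top = v >> 252                # top window, used unrecoded
        digits = [((v >> (4 * i)) & 0xf) - 8 for i in range(63)]
        return top, digits            # each digit lies in [-8, 7]

    def negate_y(y):
        # Conditional negation of a selected y coordinate: y = 0 maps to 0,
        # keeping the degenerate (z = 0) encoding of the point at infinity.
        return (P_SM2 - y) % P_SM2

    k = 0x7203df6b21c6052b53bbf40939d54123      # any scalar below 2^255
    top, digits = recode(k)
    assert (top << 252) + sum(d << (4 * i) for i, d in enumerate(digits)) == k

A zero digit selects the all-zero triple, which the addition subroutine treats as the point at infinity, so the loop needs no special case for it.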
+ + stp x0, x1, [tabent] + stp x2, x3, [tabent+16] + + mov x0, #0xffffffffffffffff + subs x0, x0, x4 + mov x1, #0xffffffff00000000 + sbcs x1, x1, x5 + mov x2, #0xffffffffffffffff + sbcs x2, x2, x6 + mov x3, #0xfffffffeffffffff + sbc x3, x3, x7 + + orr x12, x4, x5 + orr x13, x6, x7 + orr x12, x12, x13 + cmp x12, xzr + ccmp x16, xzr, #4, ne + csel x4, x0, x4, ne + csel x5, x1, x5, ne + csel x6, x2, x6, ne + csel x7, x3, x7, ne + + stp x4, x5, [tabent+32] + stp x6, x7, [tabent+48] + stp x8, x9, [tabent+64] + stp x10, x11, [tabent+80] + + add x0, acc + add x1, acc + add x2, tabent + bl sm2_montjscalarmul_alt_sm2_montjadd + + cbnz j, sm2_montjscalarmul_alt_mainloop + +// That's the end of the main loop, and we just need to copy the +// result in "acc" to the output. + + ldp x0, x1, [acc] + stp x0, x1, [res] + ldp x0, x1, [acc+16] + stp x0, x1, [res, #16] + ldp x0, x1, [acc+32] + stp x0, x1, [res, #32] + ldp x0, x1, [acc+48] + stp x0, x1, [res, #48] + ldp x0, x1, [acc+64] + stp x0, x1, [res, #64] + ldp x0, x1, [acc+80] + stp x0, x1, [res, #80] + +// Restore stack and registers and return + + add sp, sp, NSPACE + ldp x21, x30, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +// Local copies of subroutines, complete clones at the moment + +sm2_montjscalarmul_alt_sm2_montjadd: + sub sp, sp, #0xe0 + mov x15, x0 + mov x16, x1 + mov x17, x2 + ldp x2, x3, [x16, #0x40] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x16, #0x50] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, hs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + lsl x4, x8, #32 + lsr x5, x8, #32 + subs x2, x4, x8 + sbc x3, x5, xzr + subs x9, x9, x2 + sbcs x10, x10, x3 + sbcs x11, x11, x4 + sbc x8, x8, x5 + lsl x4, x9, #32 + lsr x5, x9, #32 + subs x2, x4, x9 + sbc x3, x5, xzr + subs x10, x10, x2 + sbcs x11, x11, x3 + sbcs x8, x8, x4 + sbc x9, x9, x5 + lsl x4, x10, #32 + lsr x5, x10, #32 + subs x2, x4, x10 + sbc x3, x5, xzr + subs x11, x11, x2 + sbcs x8, x8, x3 + sbcs x9, x9, x4 + sbc x10, x10, x5 + lsl x4, x11, #32 + lsr x5, x11, #32 + subs x2, x4, x11 + sbc x3, x5, xzr + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, x4 + sbc x11, x11, x5 + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + csetm x2, hs + subs x8, x8, x2 + and x3, x2, #0xffffffff00000000 + sbcs x9, x9, x3 + and x5, x2, #0xfffffffeffffffff + sbcs x10, x10, x2 + sbc x11, x11, x5 + stp x8, x9, [sp] + stp x10, x11, [sp, #0x10] + ldp x2, x3, [x17, #0x40] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x17, #0x50] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, 
x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, hs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + lsl x4, x8, #32 + lsr x5, x8, #32 + subs x2, x4, x8 + sbc x3, x5, xzr + subs x9, x9, x2 + sbcs x10, x10, x3 + sbcs x11, x11, x4 + sbc x8, x8, x5 + lsl x4, x9, #32 + lsr x5, x9, #32 + subs x2, x4, x9 + sbc x3, x5, xzr + subs x10, x10, x2 + sbcs x11, x11, x3 + sbcs x8, x8, x4 + sbc x9, x9, x5 + lsl x4, x10, #32 + lsr x5, x10, #32 + subs x2, x4, x10 + sbc x3, x5, xzr + subs x11, x11, x2 + sbcs x8, x8, x3 + sbcs x9, x9, x4 + sbc x10, x10, x5 + lsl x4, x11, #32 + lsr x5, x11, #32 + subs x2, x4, x11 + sbc x3, x5, xzr + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, x4 + sbc x11, x11, x5 + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + csetm x2, hs + subs x8, x8, x2 + and x3, x2, #0xffffffff00000000 + sbcs x9, x9, x3 + and x5, x2, #0xfffffffeffffffff + sbcs x10, x10, x2 + sbc x11, x11, x5 + stp x8, x9, [sp, #0xa0] + stp x10, x11, [sp, #0xb0] + ldp x3, x4, [x17, #0x40] + ldp x7, x8, [x16, #0x20] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x16, #0x30] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [x17, #0x50] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo 
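All of the multiply-and-reduce blocks in these local subroutines share one shape: a 4x4 schoolbook product into eight limbs, four word-level Montgomery reduction steps (the lsl/lsr/subs/sbc runs), and a single conditional subtraction of p_sm2 (the adds #0x1 ... csel tail just above). The word steps work because p_sm2 is congruent to -1 modulo 2^64, so the quotient word is simply the current low limb. A hedged Python model of the arithmetic, not of the exact register schedule:

    P_SM2 = 2**256 - 2**224 - 2**96 + 2**64 - 1
    MASK64 = 2**64 - 1

    def montgomery_reduce(t):
        # Maps t to t * 2^-256 mod p_sm2, for t < 2^256 * p_sm2
        # (for example the product of two values below p_sm2).
        for _ in range(4):
            m = t & MASK64              # quotient word: -p_sm2^-1 mod 2^64 is 1
            t = (t + m * P_SM2) >> 64   # low limb cancels; drop one word
        if t >= P_SM2:                  # at most one correction remains
            t -= P_SM2
        return t

    def montmul(a, b):
        # Montgomery-domain product: with R = 2^256 and inputs a*R and b*R
        # mod p_sm2, this returns a*b*R mod p_sm2.
        return montgomery_reduce(a * b)

In the assembly the product m * p_sm2 is never formed with a multiplier; the special shape of p_sm2 lets each step be carried out with shifts, subtractions and carries instead.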
+ stp x12, x13, [sp, #0xc0] + stp x14, x0, [sp, #0xd0] + ldp x3, x4, [x16, #0x40] + ldp x7, x8, [x17, #0x20] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x17, #0x30] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [x16, #0x50] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0x20] + stp x14, x0, [sp, #0x30] + ldp x3, x4, [sp] + ldp x7, x8, [x17] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x17, #0x10] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #0x10] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, 
x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0x40] + stp x14, x0, [sp, #0x50] + ldp x3, x4, [sp, #0xa0] + ldp x7, x8, [x16] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x16, #0x10] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #0xb0] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0x80] + stp x14, x0, [sp, #0x90] + ldp x3, x4, [sp] + ldp x7, x8, [sp, #0x20] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, 
x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #0x30] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #0x10] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0x20] + stp x14, x0, [sp, #0x30] + ldp x3, x4, [sp, #0xa0] + ldp x7, x8, [sp, #0xc0] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #0xd0] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #0xb0] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, 
x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0xc0] + stp x14, x0, [sp, #0xd0] + ldp x5, x6, [sp, #0x40] + ldp x4, x3, [sp, #0x80] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x50] + ldp x4, x3, [sp, #0x90] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, #0xa0] + stp x7, x8, [sp, #0xb0] + ldp x5, x6, [sp, #0x20] + ldp x4, x3, [sp, #0xc0] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x30] + ldp x4, x3, [sp, #0xd0] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, #0x20] + stp x7, x8, [sp, #0x30] + ldp x2, x3, [sp, #0xa0] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #0xb0] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, hs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + lsl x4, x8, #32 + lsr x5, x8, #32 + subs x2, x4, x8 + sbc x3, x5, xzr + subs x9, x9, x2 + sbcs x10, x10, x3 + sbcs x11, x11, x4 + sbc x8, x8, x5 + lsl x4, x9, #32 + lsr x5, x9, #32 + subs x2, x4, x9 + sbc x3, x5, xzr + subs x10, x10, x2 + sbcs x11, x11, x3 + sbcs x8, x8, x4 + sbc x9, x9, x5 + lsl x4, x10, #32 + lsr x5, x10, #32 + subs x2, x4, x10 + sbc x3, x5, xzr + subs x11, x11, x2 + sbcs x8, x8, x3 + sbcs x9, x9, x4 + sbc x10, x10, x5 + lsl x4, x11, #32 + lsr x5, x11, #32 + subs x2, x4, x11 + sbc x3, x5, xzr + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, x4 + sbc x11, x11, x5 + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + csetm x2, hs + subs x8, x8, x2 + and x3, x2, #0xffffffff00000000 + sbcs x9, x9, x3 + and x5, x2, #0xfffffffeffffffff + sbcs x10, x10, x2 + sbc x11, x11, x5 + stp x8, x9, [sp, #0x60] + stp x10, x11, [sp, #0x70] + ldp x2, x3, [sp, #0x20] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, 
#0x30] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, hs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + lsl x4, x8, #32 + lsr x5, x8, #32 + subs x2, x4, x8 + sbc x3, x5, xzr + subs x9, x9, x2 + sbcs x10, x10, x3 + sbcs x11, x11, x4 + sbc x8, x8, x5 + lsl x4, x9, #32 + lsr x5, x9, #32 + subs x2, x4, x9 + sbc x3, x5, xzr + subs x10, x10, x2 + sbcs x11, x11, x3 + sbcs x8, x8, x4 + sbc x9, x9, x5 + lsl x4, x10, #32 + lsr x5, x10, #32 + subs x2, x4, x10 + sbc x3, x5, xzr + subs x11, x11, x2 + sbcs x8, x8, x3 + sbcs x9, x9, x4 + sbc x10, x10, x5 + lsl x4, x11, #32 + lsr x5, x11, #32 + subs x2, x4, x11 + sbc x3, x5, xzr + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, x4 + sbc x11, x11, x5 + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, hs + mov x3, #-0x100000000 + mov x5, #-0x100000001 + adds x12, x8, #0x1 + sbcs x13, x9, x3 + adcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, lo + csel x9, x9, x13, lo + csel x10, x10, x14, lo + csel x11, x11, x7, lo + stp x8, x9, [sp] + stp x10, x11, [sp, #0x10] + ldp x3, x4, [sp, #0x60] + ldp x7, x8, [sp, #0x80] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #0x90] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #0x70] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 
+ lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0x80] + stp x14, x0, [sp, #0x90] + ldp x3, x4, [sp, #0x60] + ldp x7, x8, [sp, #0x40] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #0x50] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #0x70] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0x40] + stp x14, x0, [sp, #0x50] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #0x80] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x10] + ldp x4, x3, [sp, #0x90] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #0x10] + ldp x5, x6, [sp, #0x40] + ldp x4, x3, [sp, #0x80] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x50] + ldp x4, x3, [sp, #0x90] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, 
#0x60] + stp x7, x8, [sp, #0x70] + ldp x3, x4, [sp, #0xa0] + ldp x7, x8, [x16, #0x40] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x16, #0x50] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #0xb0] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0xa0] + stp x14, x0, [sp, #0xb0] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #0x40] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x10] + ldp x4, x3, [sp, #0x50] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #0x10] + ldp x5, x6, [sp, #0x80] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x90] + ldp x4, x3, [sp, #0x10] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, #0x80] + stp x7, x8, [sp, #0x90] + ldp x3, x4, [sp, #0x60] + ldp x7, x8, [sp, #0xc0] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #0xd0] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #0x70] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul 
x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0x60] + stp x14, x0, [sp, #0x70] + ldp x3, x4, [sp, #0xa0] + ldp x7, x8, [x17, #0x40] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x17, #0x50] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #0xb0] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, 
xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0xa0] + stp x14, x0, [sp, #0xb0] + ldp x3, x4, [sp, #0x20] + ldp x7, x8, [sp, #0x80] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #0x90] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #0x30] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0x80] + stp x14, x0, [sp, #0x90] + ldp x5, x6, [sp, #0x80] + ldp x4, x3, [sp, #0x60] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x90] + ldp x4, x3, [sp, #0x70] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, #0x80] + stp x7, x8, [sp, #0x90] + ldp x0, x1, [x16, #0x40] + ldp x2, x3, [x16, #0x50] + orr x12, x0, x1 + orr x13, x2, x3 + orr x12, x12, x13 + cmp x12, xzr + cset x12, ne + ldp x4, x5, [x17, #0x40] + ldp x6, x7, [x17, #0x50] + orr x13, x4, x5 + orr x14, x6, x7 + 
orr x13, x13, x14 + cmp x13, xzr + cset x13, ne + cmp x13, x12 + ldp x8, x9, [sp, #0xa0] + csel x8, x0, x8, lo + csel x9, x1, x9, lo + csel x8, x4, x8, hi + csel x9, x5, x9, hi + ldp x10, x11, [sp, #0xb0] + csel x10, x2, x10, lo + csel x11, x3, x11, lo + csel x10, x6, x10, hi + csel x11, x7, x11, hi + ldp x12, x13, [x16] + ldp x0, x1, [sp] + csel x0, x12, x0, lo + csel x1, x13, x1, lo + ldp x12, x13, [x17] + csel x0, x12, x0, hi + csel x1, x13, x1, hi + ldp x12, x13, [x16, #0x10] + ldp x2, x3, [sp, #0x10] + csel x2, x12, x2, lo + csel x3, x13, x3, lo + ldp x12, x13, [x17, #0x10] + csel x2, x12, x2, hi + csel x3, x13, x3, hi + ldp x12, x13, [x16, #0x20] + ldp x4, x5, [sp, #0x80] + csel x4, x12, x4, lo + csel x5, x13, x5, lo + ldp x12, x13, [x17, #0x20] + csel x4, x12, x4, hi + csel x5, x13, x5, hi + ldp x12, x13, [x16, #0x30] + ldp x6, x7, [sp, #0x90] + csel x6, x12, x6, lo + csel x7, x13, x7, lo + ldp x12, x13, [x17, #0x30] + csel x6, x12, x6, hi + csel x7, x13, x7, hi + stp x0, x1, [x15] + stp x2, x3, [x15, #0x10] + stp x4, x5, [x15, #0x20] + stp x6, x7, [x15, #0x30] + stp x8, x9, [x15, #0x40] + stp x10, x11, [x15, #0x50] + add sp, sp, #0xe0 + ret + +sm2_montjscalarmul_alt_sm2_montjdouble: + sub sp, sp, #0xc0 + mov x15, x0 + mov x16, x1 + ldp x2, x3, [x16, #0x40] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x16, #0x50] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, hs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + lsl x4, x8, #32 + lsr x5, x8, #32 + subs x2, x4, x8 + sbc x3, x5, xzr + subs x9, x9, x2 + sbcs x10, x10, x3 + sbcs x11, x11, x4 + sbc x8, x8, x5 + lsl x4, x9, #32 + lsr x5, x9, #32 + subs x2, x4, x9 + sbc x3, x5, xzr + subs x10, x10, x2 + sbcs x11, x11, x3 + sbcs x8, x8, x4 + sbc x9, x9, x5 + lsl x4, x10, #32 + lsr x5, x10, #32 + subs x2, x4, x10 + sbc x3, x5, xzr + subs x11, x11, x2 + sbcs x8, x8, x3 + sbcs x9, x9, x4 + sbc x10, x10, x5 + lsl x4, x11, #32 + lsr x5, x11, #32 + subs x2, x4, x11 + sbc x3, x5, xzr + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, x4 + sbc x11, x11, x5 + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, hs + mov x3, #-0x100000000 + mov x5, #-0x100000001 + adds x12, x8, #0x1 + sbcs x13, x9, x3 + adcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, lo + csel x9, x9, x13, lo + csel x10, x10, x14, lo + csel x11, x11, x7, lo + stp x8, x9, [sp] + stp x10, x11, [sp, #0x10] + ldp x2, x3, [x16, #0x20] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x16, #0x30] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + 
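One more aside, on the selection block that ends the addition subroutine above (the orr / cset / csel run just before its ret): because a zero z coordinate encodes the point at infinity, the routine checks both input z values and, in constant time, returns the other operand when exactly one input is infinity, falling back to the freshly computed sum otherwise. A sketch of the same decision, with the constant-time csel chain replaced by ordinary branches for clarity:

    def select_result(p1, p2, computed_sum):
        # p1, p2 and computed_sum are (x, y, z) triples; z == 0 means the
        # point at infinity.  Mirrors the csel logic, which keeps the
        # computed value whenever both or neither z is zero.
        (_, _, z1), (_, _, z2) = p1, p2
        if z1 == 0 and z2 != 0:
            return p2
        if z2 == 0 and z1 != 0:
            return p1
        return computed_sum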
adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, hs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + lsl x4, x8, #32 + lsr x5, x8, #32 + subs x2, x4, x8 + sbc x3, x5, xzr + subs x9, x9, x2 + sbcs x10, x10, x3 + sbcs x11, x11, x4 + sbc x8, x8, x5 + lsl x4, x9, #32 + lsr x5, x9, #32 + subs x2, x4, x9 + sbc x3, x5, xzr + subs x10, x10, x2 + sbcs x11, x11, x3 + sbcs x8, x8, x4 + sbc x9, x9, x5 + lsl x4, x10, #32 + lsr x5, x10, #32 + subs x2, x4, x10 + sbc x3, x5, xzr + subs x11, x11, x2 + sbcs x8, x8, x3 + sbcs x9, x9, x4 + sbc x10, x10, x5 + lsl x4, x11, #32 + lsr x5, x11, #32 + subs x2, x4, x11 + sbc x3, x5, xzr + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, x4 + sbc x11, x11, x5 + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, hs + mov x3, #-0x100000000 + mov x5, #-0x100000001 + adds x12, x8, #0x1 + sbcs x13, x9, x3 + adcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, lo + csel x9, x9, x13, lo + csel x10, x10, x14, lo + csel x11, x11, x7, lo + stp x8, x9, [sp, #0x20] + stp x10, x11, [sp, #0x30] + ldp x5, x6, [x16] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x16, #0x10] + ldp x4, x3, [sp, #0x10] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, #0x60] + stp x7, x8, [sp, #0x70] + ldp x4, x5, [x16] + ldp x8, x9, [sp] + adds x4, x4, x8 + adcs x5, x5, x9 + ldp x6, x7, [x16, #0x10] + ldp x10, x11, [sp, #0x10] + adcs x6, x6, x10 + adcs x7, x7, x11 + csetm x2, hs + subs x4, x4, x2 + and x3, x2, #0xffffffff00000000 + sbcs x5, x5, x3 + and x1, x2, #0xfffffffeffffffff + sbcs x6, x6, x2 + sbc x7, x7, x1 + stp x4, x5, [sp, #0x40] + stp x6, x7, [sp, #0x50] + ldp x3, x4, [sp, #0x40] + ldp x7, x8, [sp, #0x60] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #0x70] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #0x50] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + 
sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0x60] + stp x14, x0, [sp, #0x70] + ldp x4, x5, [x16, #0x20] + ldp x8, x9, [x16, #0x40] + adds x4, x4, x8 + adcs x5, x5, x9 + ldp x6, x7, [x16, #0x30] + ldp x10, x11, [x16, #0x50] + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x3, xzr, xzr + adds x8, x4, #0x1 + mov x9, #-0x100000000 + sbcs x9, x5, x9 + adcs x10, x6, xzr + mov x11, #-0x100000001 + sbcs x11, x7, x11 + sbcs x3, x3, xzr + csel x4, x4, x8, lo + csel x5, x5, x9, lo + csel x6, x6, x10, lo + csel x7, x7, x11, lo + stp x4, x5, [sp, #0x40] + stp x6, x7, [sp, #0x50] + ldp x3, x4, [x16] + ldp x7, x8, [sp, #0x20] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #0x30] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [x16, #0x10] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, 
x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0x80] + stp x14, x0, [sp, #0x90] + ldp x2, x3, [sp, #0x60] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #0x70] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, hs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + lsl x4, x8, #32 + lsr x5, x8, #32 + subs x2, x4, x8 + sbc x3, x5, xzr + subs x9, x9, x2 + sbcs x10, x10, x3 + sbcs x11, x11, x4 + sbc x8, x8, x5 + lsl x4, x9, #32 + lsr x5, x9, #32 + subs x2, x4, x9 + sbc x3, x5, xzr + subs x10, x10, x2 + sbcs x11, x11, x3 + sbcs x8, x8, x4 + sbc x9, x9, x5 + lsl x4, x10, #32 + lsr x5, x10, #32 + subs x2, x4, x10 + sbc x3, x5, xzr + subs x11, x11, x2 + sbcs x8, x8, x3 + sbcs x9, x9, x4 + sbc x10, x10, x5 + lsl x4, x11, #32 + lsr x5, x11, #32 + subs x2, x4, x11 + sbc x3, x5, xzr + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, x4 + sbc x11, x11, x5 + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, hs + mov x3, #-0x100000000 + mov x5, #-0x100000001 + adds x12, x8, #0x1 + sbcs x13, x9, x3 + adcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, lo + csel x9, x9, x13, lo + csel x10, x10, x14, lo + csel x11, x11, x7, lo + stp x8, x9, [sp, #0xa0] + stp x10, x11, [sp, #0xb0] + ldp x2, x3, [sp, #0x40] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #0x50] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, hs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + lsl x4, x8, #32 + lsr x5, x8, #32 + subs x2, x4, x8 + sbc x3, x5, xzr + subs x9, x9, x2 + sbcs x10, x10, x3 + sbcs x11, x11, x4 + sbc x8, x8, x5 + lsl x4, x9, #32 + lsr x5, x9, #32 + subs x2, x4, x9 + sbc x3, x5, xzr + subs x10, x10, x2 + sbcs x11, x11, x3 + sbcs x8, x8, x4 + sbc x9, x9, x5 + lsl x4, x10, #32 + lsr x5, x10, #32 + subs x2, x4, x10 + sbc x3, x5, xzr + subs x11, x11, x2 + sbcs x8, x8, x3 + sbcs x9, x9, x4 + sbc x10, x10, x5 + lsl x4, x11, #32 + lsr x5, x11, #32 + subs x2, x4, x11 + sbc x3, x5, xzr + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, x4 + sbc x11, x11, x5 + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, 
hs + mov x3, #-0x100000000 + mov x5, #-0x100000001 + adds x12, x8, #0x1 + sbcs x13, x9, x3 + adcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, lo + csel x9, x9, x13, lo + csel x10, x10, x14, lo + csel x11, x11, x7, lo + stp x8, x9, [sp, #0x40] + stp x10, x11, [sp, #0x50] + mov x1, #0x9 + mov x2, #-0x1 + ldp x9, x10, [sp, #0xa0] + subs x9, x2, x9 + mov x3, #-0x100000000 + sbcs x10, x3, x10 + ldp x11, x12, [sp, #0xb0] + sbcs x11, x2, x11 + mov x4, #-0x100000001 + sbc x12, x4, x12 + mul x3, x1, x9 + mul x4, x1, x10 + mul x5, x1, x11 + mul x6, x1, x12 + umulh x9, x1, x9 + umulh x10, x1, x10 + umulh x11, x1, x11 + umulh x7, x1, x12 + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, xzr + mov x1, #0xc + ldp x9, x10, [sp, #0x80] + mul x8, x9, x1 + umulh x9, x9, x1 + adds x3, x3, x8 + mul x8, x10, x1 + umulh x10, x10, x1 + adcs x4, x4, x8 + ldp x11, x12, [sp, #0x90] + mul x8, x11, x1 + umulh x11, x11, x1 + adcs x5, x5, x8 + mul x8, x12, x1 + umulh x12, x12, x1 + adcs x6, x6, x8 + adc x7, x7, xzr + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, x12 + add x7, x7, #0x1 + lsl x8, x7, #32 + sub x9, x8, x7 + adds x3, x3, x7 + adcs x4, x4, x9 + adcs x5, x5, xzr + adcs x6, x6, x8 + csetm x7, lo + adds x3, x3, x7 + and x9, x7, #0xffffffff00000000 + adcs x4, x4, x9 + adcs x5, x5, x7 + and x8, x7, #0xfffffffeffffffff + adc x6, x6, x8 + stp x3, x4, [sp, #0xa0] + stp x5, x6, [sp, #0xb0] + ldp x5, x6, [sp, #0x40] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x50] + ldp x4, x3, [sp, #0x10] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, #0x40] + stp x7, x8, [sp, #0x50] + ldp x2, x3, [sp, #0x20] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #0x30] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, hs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + lsl x4, x8, #32 + lsr x5, x8, #32 + subs x2, x4, x8 + sbc x3, x5, xzr + subs x9, x9, x2 + sbcs x10, x10, x3 + sbcs x11, x11, x4 + sbc x8, x8, x5 + lsl x4, x9, #32 + lsr x5, x9, #32 + subs x2, x4, x9 + sbc x3, x5, xzr + subs x10, x10, x2 + sbcs x11, x11, x3 + sbcs x8, x8, x4 + sbc x9, x9, x5 + lsl x4, x10, #32 + lsr x5, x10, #32 + subs x2, x4, x10 + sbc x3, x5, xzr + subs x11, x11, x2 + sbcs x8, x8, x3 + sbcs x9, x9, x4 + sbc x10, x10, x5 + lsl x4, x11, #32 + lsr x5, x11, #32 + subs x2, x4, x11 + sbc x3, x5, xzr + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, x4 + sbc x11, x11, x5 + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, hs + mov x3, #-0x100000000 + mov x5, #-0x100000001 + adds x12, x8, #0x1 + sbcs x13, x9, x3 + adcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, lo 
+ csel x9, x9, x13, lo + csel x10, x10, x14, lo + csel x11, x11, x7, lo + stp x8, x9, [sp] + stp x10, x11, [sp, #0x10] + ldp x3, x4, [sp, #0xa0] + ldp x7, x8, [sp, #0x60] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #0x70] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #0xb0] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0x60] + stp x14, x0, [sp, #0x70] + ldp x5, x6, [sp, #0x40] + ldp x4, x3, [sp, #0x20] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x50] + ldp x4, x3, [sp, #0x30] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [x15, #0x40] + stp x7, x8, [x15, #0x50] + ldp x1, x2, [sp, #0x80] + lsl x0, x1, #2 + ldp x6, x7, [sp, #0xa0] + subs x0, x0, x6 + extr x1, x2, x1, #0x3e + sbcs x1, x1, x7 + ldp x3, x4, [sp, #0x90] + extr x2, x3, x2, #0x3e + ldp x6, x7, [sp, #0xb0] + sbcs x2, x2, x6 + extr x3, x4, x3, #0x3e + sbcs x3, x3, x7 + lsr x4, x4, #62 + sbc x4, x4, xzr + add x4, x4, #0x1 + lsl x5, x4, #32 + sub x6, x5, x4 + adds x0, x0, x4 + adcs x1, x1, x6 + adcs x2, x2, xzr + adcs x3, x3, x5 + csetm x4, lo + adds x0, x0, x4 + and x6, x4, #0xffffffff00000000 + adcs x1, x1, x6 + adcs x2, x2, x4 + and x5, x4, #0xfffffffeffffffff + adc x3, x3, x5 + stp x0, x1, [x15] + stp x2, x3, [x15, #0x10] + mov x1, #0x8 + mov x2, #-0x1 + ldp x9, x10, [sp] + subs x9, x2, x9 + mov x3, #-0x100000000 + sbcs 
x10, x3, x10 + ldp x11, x12, [sp, #0x10] + sbcs x11, x2, x11 + mov x4, #-0x100000001 + sbc x12, x4, x12 + lsl x3, x9, #3 + extr x4, x10, x9, #0x3d + extr x5, x11, x10, #0x3d + extr x6, x12, x11, #0x3d + lsr x7, x12, #61 + mov x1, #0x3 + ldp x9, x10, [sp, #0x60] + mul x8, x9, x1 + umulh x9, x9, x1 + adds x3, x3, x8 + mul x8, x10, x1 + umulh x10, x10, x1 + adcs x4, x4, x8 + ldp x11, x12, [sp, #0x70] + mul x8, x11, x1 + umulh x11, x11, x1 + adcs x5, x5, x8 + mul x8, x12, x1 + umulh x12, x12, x1 + adcs x6, x6, x8 + adc x7, x7, xzr + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, x12 + add x7, x7, #0x1 + lsl x8, x7, #32 + sub x9, x8, x7 + adds x3, x3, x7 + adcs x4, x4, x9 + adcs x5, x5, xzr + adcs x6, x6, x8 + csetm x7, lo + adds x3, x3, x7 + and x9, x7, #0xffffffff00000000 + adcs x4, x4, x9 + adcs x5, x5, x7 + and x8, x7, #0xfffffffeffffffff + adc x6, x6, x8 + stp x3, x4, [x15, #0x20] + stp x5, x6, [x15, #0x30] + add sp, sp, #0xc0 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/README.md b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/README.md new file mode 100644 index 00000000000..23697bc3b7c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/README.md @@ -0,0 +1,23 @@ +# Tutorials for s2n-bignum + +This directory includes examples for verifying Arm programs using s2n-bignum +and HOL Light. +To verify programs in x86, see `x86/tutorial`. + +### Unary reasoning + +1. `simple.ml`: Verifying a simple arithmetic property of a linear program. +2. `sequence.ml`: Verifying a program by splitting into smaller chunks. +3. `branch.ml`: Verifying a program that has a conditional branch. +4. `memory.ml`: Verifying a program that manipulates a memory. +5. `loop.ml`: Verifying a program that has a simple loop. +6. `bignum.ml`: Writing a specification of a program dealing with big numbers & proving it. +7. `rodata.ml`: Reading data from the read-only section. + +### Relational reasoning + +1. `rel_simp.ml`: Proving equivalence of two simple programs. +2. `rel_equivtac.ml`: Proving equivalence of two programs that have small differences. +3. `rel_reordertac.ml`: Proving equivalence of two programs where the second one has instructions reordered from that of the first one. +4. `rel_loop.ml`: Proving equivalence of two simple loops. +5. `rel_veceq.ml`: Proving equivalence of scalar vs. vectorized 128x128->256-bit squaring. diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/bignum.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/bignum.S new file mode 100644 index 00000000000..0d72c06c338 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/bignum.S @@ -0,0 +1,11 @@ + ldp x2, x3, [x0] + ldp x4, x5, [x1] + cmp x2, x4 + bne bb_false + cmp x3, x5 + bne bb_false + mov x0, #1 + ret +bb_false: + mov x0, xzr + ret diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/bignum.ml b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/bignum.ml new file mode 100644 index 00000000000..a445b6d76ef --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/bignum.ml @@ -0,0 +1,159 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(****************************************************************************** + An example that shows how to describe big numbers in a specification. +******************************************************************************) + +needs "arm/proofs/base.ml";; + +(* Let's prove that the following program + + 0: a9400c02 ldp x2, x3, [x0] + 4: a9401424 ldp x4, x5, [x1] + 8: eb04005f cmp x2, x4 + c: 540000a1 b.ne 20 // b.any + 10: eb05007f cmp x3, x5 + 14: 54000061 b.ne 20 // b.any + 18: d2800020 mov x0, #0x1 + 1c: d65f03c0 ret + +0000000000000020 : + 20: aa1f03e0 mov x0, xzr + 24: d65f03c0 ret + + .. returns 1 to x0 if a pair of 16-byte integers at buffer x0 and x1 + are equal, 0 otherwise. + Since this example uses 128 bit integers, we will use 'bignum_from_memory' + which will state that reading a memory buffer of a specified word number will + return some large natural number. +*) +let bignum_mc = define_assert_from_elf "bignum_mc" "arm/tutorial/bignum.o" [ + 0xa9400c02; (* arm_LDP X2 X3 X0 (Immediate_Offset (iword (&0))) *) + 0xa9401424; (* arm_LDP X4 X5 X1 (Immediate_Offset (iword (&0))) *) + 0xeb04005f; (* arm_CMP X2 X4 *) + 0x540000a1; (* arm_BNE (word 20) *) + 0xeb05007f; (* arm_CMP X3 X5 *) + 0x54000061; (* arm_BNE (word 12) *) + 0xd2800020; (* arm_MOV X0 (rvalue (word 1)) *) + 0xd65f03c0; (* arm_RET X30 *) + 0xaa1f03e0; (* arm_MOV X0 XZR *) + 0xd65f03c0 (* arm_RET X30 *) +];; + +(* +You can get the above OCaml list data structure from +`print_literal_from_elf "<.o file>"` or +`save_literal_from_elf "" "<.o file>"`. +*) + +(* ARM_MK_EXEC_RULE decodes the byte sequence into conjunction of + equalities between the bytes and instructions. *) +let EXEC = ARM_MK_EXEC_RULE bignum_mc;; + +let BIGNUM_SPEC = prove( + `forall pc retpc loc0 loc1 a b. + ensures arm + // Precondition + (\s. aligned_bytes_loaded s (word pc) bignum_mc /\ + read PC s = word pc /\ + read X30 s = word retpc /\ + read X0 s = word loc0 /\ + read X1 s = word loc1 /\ + // Read 2 words (=128bits) at loc0. It is equivalent to num a. + // Alternatively, this kind of condition can be written using + // bignum_of_wordlist which takes a list of 64-bit words. + bignum_from_memory (word loc0,2) s = a /\ + // Read 2 words (=128bits) at loc1. It is equivalent to num b. + bignum_from_memory (word loc1,2) s = b + ) + // Postcondition + (\s. read PC s = word retpc /\ + read X0 s = word (if a = b then 1 else 0)) + // Registers (and memory locations) that may change after execution + (MAYCHANGE [PC;X0;X2;X3;X4;X5] ,, MAYCHANGE SOME_FLAGS ,, + MAYCHANGE [events])`, + + REPEAT STRIP_TAC THEN + (* Convert 'bignum_from_memory' into 'memory :> bytes (..)'. + Also, expand SOME_FLAGS *) + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES;SOME_FLAGS] THEN + (* Start symbolic execution with state 's0' *) + ENSURES_INIT_TAC "s0" THEN + (* Split the memory :> bytes .. into a pair of memory :> bytes64. + This is necessary to successfully encode the symbolic result of ldps. *) + BIGNUM_DIGITIZE_TAC "a_" `read (memory :> bytes (word loc0,8 * 2)) s0` THEN + BIGNUM_DIGITIZE_TAC "b_" `read (memory :> bytes (word loc1,8 * 2)) s0` THEN + + (* Symbolically run two ldp instructions *) + ARM_STEPS_TAC EXEC (1--2) THEN + (* Until first 'bne' *) + ARM_STEPS_TAC EXEC (3--4) THEN + + (* Recognize the if condition and create two subgoals . *) + FIRST_X_ASSUM MP_TAC THEN + COND_CASES_TAC THENL [ + (* The low 64 bits of a and b are different. 
*) + STRIP_TAC THEN + ARM_STEPS_TAC EXEC (5--6) THEN + (* Returned; Finalize symbolic execution. *) + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN + (* From `~(val (word_sub a_0 b_0) = 0)` and `val a_0 + 2 EXP 64 * val a_1 = a`, + and `val b_0 + 2 EXP 64 * val b_1 = b`, + prove `~(a = b)`. *) + SUBGOAL_THEN `~(a:num = b)` (fun th -> REWRITE_TAC[th]) THEN + MAP_EVERY EXPAND_TAC ["a";"b"] THEN + (* VAL_WORD_SUB_EQ_0: |- !x y. val (word_sub x y) = 0 <=> val x = val y) *) + RULE_ASSUM_TAC (REWRITE_RULE [VAL_WORD_SUB_EQ_0]) THEN + (* EQ_DIVMOD: |- !p m n. m DIV p = n DIV p /\ m MOD p = n MOD p <=> m = n *) + ONCE_REWRITE_TAC[SPEC `2 EXP 64` (GSYM EQ_DIVMOD)] THEN + (* The first '.. DIV .. = .. DIV ..' part is irelevant. *) + MATCH_MP_TAC (TAUT (`~Q ==> ~(P /\ Q)`)) THEN + (* Simplfy! *) + SIMP_TAC[MOD_MULT_ADD;VAL_BOUND_64;ARITH_RULE`~(2 EXP 64 = 0)`] THEN + ASM_SIMP_TAC[MOD_LT;VAL_BOUND_64]; + + ALL_TAC + ] THEN + + (* The low 64 bits of a and b are equivalent. *) + (* Until the second 'bne' *) + STRIP_TAC THEN + ARM_STEPS_TAC EXEC (5--6) THEN + + (* Recognize the if condition and create two subgoals . *) + FIRST_X_ASSUM MP_TAC THEN + COND_CASES_TAC THENL [ + (* The high 64 bits of a and b are different. *) + STRIP_TAC THEN + ARM_STEPS_TAC EXEC (7--8) THEN + (* Returned; Finalize symbolic execution. *) + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN + (* Proof pattern is similar to the first branch case *) + SUBGOAL_THEN `~(a:num = b)` (fun th -> REWRITE_TAC[th]) THEN + MAP_EVERY EXPAND_TAC ["a";"b"] THEN + (* VAL_WORD_SUB_EQ_0: |- !x y. val (word_sub x y) = 0 <=> val x = val y) *) + RULE_ASSUM_TAC (REWRITE_RULE [VAL_WORD_SUB_EQ_0]) THEN + (* EQ_DIVMOD: |- !p m n. m DIV p = n DIV p /\ m MOD p = n MOD p <=> m = n *) + ONCE_REWRITE_TAC[SPEC `2 EXP 64` (GSYM EQ_DIVMOD)] THEN + (* The second '.. MOD .. = .. MOD ..' part is irelevant. *) + MATCH_MP_TAC (TAUT (`~P ==> ~(P /\ Q)`)) THEN + (* Simplfy! *) + SIMP_TAC[DIV_MULT_ADD;VAL_BOUND_64;ARITH_RULE`~(2 EXP 64 = 0)`] THEN + ASM_SIMP_TAC[DIV_LT;VAL_BOUND_64;ADD_CLAUSES]; + + ALL_TAC + ] THEN + + (* Both limbs are equivalent! *) + STRIP_TAC THEN + ARM_STEPS_TAC EXEC (7--8) THEN + (* Try to prove the postcondition and frame as much as possible *) + ENSURES_FINAL_STATE_TAC THEN + (* Use ASM_REWRITE_TAC[] to rewrite the goal using equalities in assumptions. *) + ASM_REWRITE_TAC[] THEN + SUBGOAL_THEN `(a:num = b)` (fun th -> REWRITE_TAC[th]) THEN + RULE_ASSUM_TAC (REWRITE_RULE [VAL_WORD_SUB_EQ_0]) THEN + ASM_ARITH_TAC);; diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/branch.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/branch.S new file mode 100644 index 00000000000..52f8d1f0ade --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/branch.S @@ -0,0 +1,7 @@ + cmp x1, x2 + b.hi BB2 + mov x0, x2 + ret +BB2: + mov x0, x1 + ret diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/branch.ml b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/branch.ml new file mode 100644 index 00000000000..284cdfcc3f0 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/branch.ml @@ -0,0 +1,119 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(****************************************************************************** + Prove a property of a simple program that has a conditional branch. 
+******************************************************************************) + +needs "arm/proofs/base.ml";; + +(* The following program + 0: eb02003f cmp x1, x2 + 4: 54000068 b.hi 10 + 8: aa0203e0 mov x0, x2 + c: d65f03c0 ret + +0000000000000010 : + 10: aa0103e0 mov x0, x1 + 14: d65f03c0 ret + + .. copies max(x1,x2) to x0 and returns to the caller. + Let's prove this property. +*) + +let branch_mc = new_definition `branch_mc = [ + word 0x3f; word 0x00; word 0x02; word 0xeb; // cmp x1, x2 + word 0x68; word 0x00; word 0x00; word 0x54; // b.hi 10 + + word 0xe0; word 0x03; word 0x02; word 0xaa; // mov x0, x2 + word 0xc0; word 0x03; word 0x5f; word 0xd6; // ret + + // BB2: + word 0xe0; word 0x03; word 0x01; word 0xaa; // mov x0, x1 + word 0xc0; word 0x03; word 0x5f; word 0xd6 // ret + ]:((8)word)list`;; + +let EXEC = ARM_MK_EXEC_RULE branch_mc;; + +let branch_SPEC = prove( + `forall pc pcret a b. + ensures arm + // Precondition + (\s. aligned_bytes_loaded s (word pc) branch_mc /\ + read X30 s = word pcret /\ + read PC s = word pc /\ + read X1 s = word a /\ + read X2 s = word b) + // Postcondition + (\s. read PC s = word pcret /\ + read X0 s = word_umax (word a) (word b)) + // Registers (and memory locations) that may change after execution. + // ',,' is composition of relations. + (MAYCHANGE [PC;X0] ,, MAYCHANGE SOME_FLAGS ,, + // Branch instructions raise observable microarchitectural events! + MAYCHANGE [events])`, + (* Strips the outermost universal quantifier from the conclusion of a goal *) + REPEAT STRIP_TAC THEN + (* ENSURES_FINAL_STATE_TAC does not understand SOME_FLAGS in MAYCHANGE. Let's + unfold this in advance. *) + REWRITE_TAC [SOME_FLAGS] THEN + + (* Let's do symbolic execution until it hits the branch instruction. *) + ENSURES_INIT_TAC "s0" THEN + ARM_STEPS_TAC EXEC (1--2) THEN + + (* The PC has the following symbolic expression: + `read PC s2 = + (if val (word b) <= val (word a) /\ + ~(val (word_sub (word a) (word b)) = 0) + then word (pc + 16) + else word (pc + 8))` + Let's do case analysis on the condition of this if expression. + + First, move this assumption to the antecendent of the goal so the goal + becomes: + (read PC s2 = ...) ==> eventually arm ... + *) + FIRST_X_ASSUM MP_TAC THEN + + (* Recognize the if condition and create two subgoals . *) + COND_CASES_TAC THENL [ + (** Case 1: if the branch was taken! **) + (* Let's name the hypothesis first. *) + POP_ASSUM (LABEL_TAC "Hcond") THEN + DISCH_TAC THEN + + (* Do symbolic execution on the remaining two insts. *) + ARM_STEPS_TAC EXEC (3--4) THEN + ENSURES_FINAL_STATE_TAC THEN + ASM_REWRITE_TAC[] THEN + + (* The remaining goal is `word a = word (MAX a b).` *) + REMOVE_THEN "Hcond" MP_TAC THEN + (* WORD_UMAX: `!x y. word_umax x y = (if val x <= val y then y else x)` + VAL_WORD_SUB_EQ_0: `!x y. val (word_sub x y) = 0 <=> val x = val y` *) + REWRITE_TAC[WORD_UMAX;VAL_WORD_SUB_EQ_0] THEN + (* Let ARITH_TAC deal with reasoning on relational equations. *) + ARITH_TAC; + + + (** Case 2: if the branch was not taken! **) + (* Let's name the hypothesis first. *) + POP_ASSUM (LABEL_TAC "Hcond") THEN + DISCH_TAC THEN + + (* Do symbolic execution on the remaining two insts. *) + ARM_STEPS_TAC EXEC (3--4) THEN + ENSURES_FINAL_STATE_TAC THEN + ASM_REWRITE_TAC[] THEN + + (* The remaining goal is `word b = word (MAX a b).` *) + REMOVE_THEN "Hcond" MP_TAC THEN + (* WORD_UMAX: `!x y. word_umax x y = (if val x <= val y then y else x)` + VAL_WORD_SUB_EQ_0: `!x y. 
val (word_sub x y) = 0 <=> val x = val y` *) + REWRITE_TAC[WORD_UMAX;VAL_WORD_SUB_EQ_0] THEN + (* Let ARITH_TAC deal with reasoning on relational equations. *) + ARITH_TAC; + ]);; diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/loop.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/loop.S new file mode 100644 index 00000000000..5872b164679 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/loop.S @@ -0,0 +1,10 @@ + mov x1, xzr + mov x0, xzr + +loop: + add x1, x1, #1 + add x0, x0, #2 + cmp x1, #10 + bne loop + + ret diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/loop.ml b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/loop.ml new file mode 100644 index 00000000000..0fb56c44678 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/loop.ml @@ -0,0 +1,123 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(****************************************************************************** + Prove a property of a simple program that has a loop. +******************************************************************************) + +needs "arm/proofs/base.ml";; + +(* The following program + 0: aa1f03e1 mov x1, xzr + 4: aa1f03e0 mov x0, xzr + +0000000000000008 : + 8: 91000421 add x1, x1, #0x1 + c: 91000800 add x0, x0, #0x2 + 10: f100283f cmp x1, #0xa + 14: 54ffffa1 b.ne 8 // b.any + 18: d65f03c0 ret + + increments x0 until its value is 20. + Let's prove that this function returns 20. +*) +let loop_mc = new_definition `loop_mc = [ + word 0xe1; word 0x03; word 0x1f; word 0xaa; // mov x1, xzr + word 0xe0; word 0x03; word 0x1f; word 0xaa; // mov x0, xzr + +// loop: + word 0x21; word 0x04; word 0x00; word 0x91; // add x1, x1, #0x1 + word 0x00; word 0x08; word 0x00; word 0x91; // add x0, x0, #0x2 + word 0x3f; word 0x28; word 0x00; word 0xf1; // cmp x1, #0xa + word 0xa1; word 0xff; word 0xff; word 0x54; // b.ne 8 + word 0xc0; word 0x03; word 0x5f; word 0xd6 // ret +]:((8)word)list`;; + +let EXEC = ARM_MK_EXEC_RULE loop_mc;; + +let loop_SPEC = prove( + `forall pc retpc. + ensures arm + // Precondition + (\s. aligned_bytes_loaded s (word pc) loop_mc /\ + read PC s = word pc /\ + read X30 s = word retpc) + // Postcondition + (\s. read PC s = word retpc /\ + read X0 s = word 20) + // Registers (and memory locations) that may change after execution + (MAYCHANGE [PC;X0;X1] ,, MAYCHANGE SOME_FLAGS ,, + // Branch instructions raise observable microarchitectural events! + MAYCHANGE [events])`, + (* Unravel ARM flag registers! *) + REWRITE_TAC[SOME_FLAGS] THEN + REPEAT STRIP_TAC THEN + + (* ENSURES_WHILE_PAUP_TAC is one of several tactics for declaring a hoare triple of a loop. + PAUP means: + - "P": The loop ends with a flag-setting instruction such as 'cmp' or 'adds'. + 'read ZF s <=> i = 10' in the below statement relates the flag with + the loop counter. + - "A": The loop counter starts from variable 'a', In this tactic, this is 0. + Actually, when a = 0, you can also use ENSURES_WHILE_PUP_TAC. + - "UP": The counter goes up. *) + ENSURES_WHILE_PAUP_TAC + `0` (* counter begin number *) + `10` (* counter end number *) + `pc + 8` (* loop body start PC *) + `pc + 0x14` (* loop backedge branch PC *) + `\i s. 
// loop invariant at the end of the loop + (read X1 s = word i /\ read X0 s = word (i*2) /\ read X30 s = word retpc) /\ + // loop backedge condition + (read ZF s <=> i = 10)` THEN + REPEAT CONJ_TAC THENL [ + (* counter begin < counter end *) + ARITH_TAC; + + (* entrance to the loop *) + (* Let's use ARM_SIM_TAC which is ENSURES_INIT_TAC + ARM_STEPS_TAC + + ENSURES_FINAL_STATE_TAC + some post-processing. *) + ARM_SIM_TAC EXEC (1--2) THEN + CONV_TAC WORD_RULE; + + (* The loop body. let's prove this later. *) + (* If you are interactively exploring this proof, try `r 1;;`. *) + ALL_TAC; + + (* Prove that backedge is taken if i != 10. *) + REPEAT STRIP_TAC THEN + ARM_SIM_TAC EXEC [1]; + + (* Loop exit to the end of the program *) + ARM_SIM_TAC EXEC (1--2) THEN + (* word (10*2) = word 20 *) + CONV_TAC WORD_RULE + ] THEN + + (* The loop body *) + REPEAT STRIP_TAC THEN + ARM_SIM_TAC EXEC (1--3) THEN + REPEAT CONJ_TAC THENL [ + (* `word_add (word i) (word 1) = word (i + 1)` *) + CONV_TAC WORD_RULE; + + (* `word_add (word (i * 2)) (word 2) = word ((i + 1) * 2)` *) + CONV_TAC WORD_RULE; + + (* `val (word_add (word i) (word 18446744073709551607)) = 0 <=> i + 1 = 10` *) + (* This goal is slightly complicated to prove using automatic solvers. + Let's manually attack this. *) + (* Yes, we also have 'WORD_BLAST' that works like bit-blasting. *) + REWRITE_TAC [WORD_BLAST `word_add x (word 18446744073709551607):int64 = + word_sub x (word 9)`] THEN + REWRITE_TAC[VAL_WORD_SUB_EQ_0] THEN + REWRITE_TAC[VAL_WORD;DIMINDEX_64] THEN + (* Rewrite all '_ MOD 2 EXP 64' to '_' because they are known to be less + than 2 EXP 64. *) + IMP_REWRITE_TAC[MOD_LT; ARITH_RULE`9 < 2 EXP 64`] THEN + CONJ_TAC THEN (* will create two arithmetic subgoals. *) + (* both goals can be solved using ASM_ARITH_TAC. *) + ASM_ARITH_TAC + ]);; diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/memory.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/memory.S new file mode 100644 index 00000000000..dd340152473 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/memory.S @@ -0,0 +1,4 @@ +ldr x2, [x0] +ldr x3, [x1] +str x2, [x1] +str x3, [x0] diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/memory.ml b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/memory.ml new file mode 100644 index 00000000000..3e3f3275295 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/memory.ml @@ -0,0 +1,76 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(****************************************************************************** + Prove a property of a simple program that reads from and writes to + the memory. +******************************************************************************) + +needs "arm/proofs/base.ml";; + +(* The following program + 0: f9400002 ldr x2, [x0] + 4: f9400023 ldr x3, [x1] + 8: f9000022 str x2, [x1] + c: f9000003 str x3, [x0] + + .. swaps the two words at address x0 and x1, of x0 and x1 do not alias. + Let's prove this. +*) + +let memory_mc = new_definition `memory_mc = [ + word 0x02; word 0x00; word 0x40; word 0xf9; // ldr x2, [x0] + word 0x23; word 0x00; word 0x40; word 0xf9; // ldr x3, [x1] + word 0x22; word 0x00; word 0x00; word 0xf9; // str x2, [x1] + word 0x03; word 0x00; word 0x00; word 0xf9 // str x3, [x0] +]:((8)word)list`;; + +let EXEC = ARM_MK_EXEC_RULE memory_mc;; + +let memory_SPEC = prove( + `forall pc loc0 loc1 a b. 
+ // Assume that loc0 (=x0) and loc1(=x1) do not overlap within 8 bytes. + nonoverlapping (word loc0:int64, 8) (word loc1:int64, 8) /\ + // .. and the writing locations do not overlap with the loaded program. + nonoverlapping (word loc0:int64, 8) (word pc:int64, LENGTH memory_mc) /\ + nonoverlapping (word loc1:int64, 8) (word pc:int64, LENGTH memory_mc) + ==> ensures arm + // Precondition + (\s. aligned_bytes_loaded s (word pc) memory_mc /\ + read PC s = word pc /\ + read X0 s = word loc0 /\ + read X1 s = word loc1 /\ + read (memory :> bytes64 (word loc0)) s = word a /\ + read (memory :> bytes64 (word loc1)) s = word b) + // Postcondition + (\s. read PC s = word (pc + 16) /\ + read (memory :> bytes64 (word loc0)) s = word b /\ + read (memory :> bytes64 (word loc1)) s = word a) + // Registers (and memory locations) that may change after execution. + // ',,' is composition of relations. + (MAYCHANGE [PC;X2;X3] ,, + // The memory locations may change. Record this. + MAYCHANGE [memory :> bytes64 (word loc0); memory :> bytes64 (word loc1)] ,, + // Memory instructions raise observable microarchitectural events! + MAYCHANGE [events])`, + + (* Convert 'nonoverlapping' into 'nonoverlapping_modulo' and rewrite 'LENGTH memory_mc' + with the concrete number. *) + REWRITE_TAC[NONOVERLAPPING_CLAUSES;fst EXEC] THEN + (* Strips the assumption and outermost universal quantifier from the conclusion of a goal *) + REPEAT STRIP_TAC THEN + + (* Let's do symbolic execution until it hits the branch instruction. *) + ENSURES_INIT_TAC "s0" THEN + ARM_STEPS_TAC EXEC (1--4) THEN + + ENSURES_FINAL_STATE_TAC THEN + ASM_REWRITE_TAC[]);; + +(* If the written nonoverlapping condition is not sufficient, existing assumptions + on memory loads may be erased after simulating store instructions. + To print which instructions are erased, set + components_print_log := true;; +*) diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_equivtac.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_equivtac.S new file mode 100644 index 00000000000..d1bbc0c2766 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_equivtac.S @@ -0,0 +1,4 @@ +ldp x11, x10, [x0] +add x12, x10, #1 +mul x12, x11, x12 // x11 * (x10 + 1) +str x12, [x1] diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_equivtac.ml b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_equivtac.ml new file mode 100644 index 00000000000..ef48c23f5c3 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_equivtac.ml @@ -0,0 +1,198 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(****************************************************************************** + An example that proves equivalence of two straight-line codes + accessing memory using EQUIV_STEPS_TAC. +******************************************************************************) + +(* Please copy this file to the root directory of + s2n-bignum, then follow the instructions. *) + +needs "arm/proofs/equiv.ml";; + +(* This example will define & prove the equivalence of two programs + using EQUIV_STEPS_TAC. + This tactic is useful if two programs are supposed to have many + equivalent parts. EQUIV_STEPS_TAC receives 'actions', which is an + OCaml list stating which lines are equivalent and which lines are diverging. + This 'actions' can be generated from, say, syntactic diff of + two assembly programs. 
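+   For example, the EQUIV proof later in this file passes the following
+   'actions' for the two 4-instruction programs mc and mc2; it states that
+   instructions 0 and 3 of both programs must yield symbolically equal
+   outputs, while instructions 1-2 are the diverging (replaced) part:
+
+     [("equal",0,1,0,1); ("replace",1,3,1,3); ("equal",3,4,3,4)]
+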
s2n-bignum also has tools/gen-actions.py + which runs the `diff` linux tool on two assembly files. *) + +let mc = define_assert_from_elf "mc" "arm/tutorial/rel_equivtac.o" [ + 0xa940280b; (* arm_LDP X11 X10 X0 (Immediate_Offset (iword (&0))) *) + 0x9100054c; (* arm_ADD X12 X10 (rvalue (word 1)) *) + 0x9b0c7d6c; (* arm_MUL X12 X11 X12 *) + 0xf900002c (* arm_STR X12 X1 (Immediate_Offset (word 0)) *) +];; + +(* Note that the used registers are different between mc and mc2 + (X10,X11,X12 vs. X20,X21,X22). This is fine since EQUIV_STEPS_TAC + can smartly map differently used registers. *) +let mc2 = define_assert_from_elf "mc2" "arm/tutorial/rel_equivtac2.o" [ + 0xa9405015; (* arm_LDP X21 X20 X0 (Immediate_Offset (iword (&0))) *) + 0x9b147eb6; (* arm_MUL X22 X21 X20 *) + 0x8b1502d6; (* arm_ADD X22 X22 X21 *) + 0xf9000036 (* arm_STR X22 X1 (Immediate_Offset (word 0)) *) +];; + +let EXEC = ARM_MK_EXEC_RULE mc;; +let EXEC2 = ARM_MK_EXEC_RULE mc2;; + +(* Define the equality between the input states. *) +let eqin = new_definition + `forall s1 s1' inbuf outbuf. + (eqin:(armstate#armstate)->int64->int64->bool) (s1,s1') inbuf outbuf <=> + (// The values of buffer pointers, X0 and X1. + // Their values are symbolically defined as inbuf and outbuf. + // outbuf is also used for the nonoverlapping precondition between + // the output buffer and the program bytecode. + read X0 s1 = inbuf /\ + read X0 s1' = inbuf /\ + read X1 s1 = outbuf /\ + read X1 s1' = outbuf /\ + // The equal buffer contents at the input buffer. '2' stands for 2 words + // (and 1 word is 8 bytes, hence 2*8=16 bytes) + (exists n. + bignum_from_memory (inbuf,2) s1 = n /\ + bignum_from_memory (inbuf,2) s1' = n))`;; + +(* Define the equality between the output states. *) +let eqout = new_definition + `forall s1 s1' outbuf. + (eqout:(armstate#armstate)->int64->bool) (s1,s1') outbuf <=> + (read X1 s1 = outbuf /\ + read X1 s1' = outbuf /\ + (exists n. + bignum_from_memory (outbuf,1) s1 = n /\ + bignum_from_memory (outbuf,1) s1' = n))`;; + +(* Now, build the program equivalence statement using + 'mk_equiv_statement_simple'. + Its first argument states the assumption that will appear at + LHS of ' ==> ensures2 ..(equiv statement)..'. + + If it fails, please try `arm_print_log := true`. *) +let equiv_goal = mk_equiv_statement_simple + `ALL (nonoverlapping (outbuf,8)) [ + (word pc:int64, LENGTH mc); + (word pc2:int64, LENGTH mc2) + ]` + eqin (* Input state equivalence *) + eqout (* Output state equivalence *) + mc (* First program machine code *) + `MAYCHANGE [PC; X10; X11; X12] ,, MAYCHANGE [memory :> bytes (outbuf, 8)] ,, + MAYCHANGE [events]` + mc2 (* Second program machine code *) + `MAYCHANGE [PC; X20; X21; X22] ,, MAYCHANGE [memory :> bytes (outbuf, 8)] ,, + MAYCHANGE [events]`;; + +(* equiv_goal is: + `forall pc pc2 inbuf outbuf. + ALL (nonoverlapping (outbuf,8)) + [word pc,LENGTH mc; word pc2,LENGTH mc2] + ==> ensures2 arm + (\(s,s2). + aligned_bytes_loaded s (word pc) mc /\ + read PC s = word pc /\ + aligned_bytes_loaded s2 (word pc2) mc2 /\ + read PC s2 = word pc2 /\ + eqin (s,s2) inbuf outbuf) + (\(s,s2). + aligned_bytes_loaded s (word pc) mc /\ + read PC s = word (pc + 16) /\ + aligned_bytes_loaded s2 (word pc2) mc2 /\ + read PC s2 = word (pc2 + 16) /\ + eqout (s,s2) outbuf) + (\(s,s2) (s',s2'). + (MAYCHANGE [PC; X10; X11; X12] ,, + MAYCHANGE [memory :> bytes (outbuf,8)] ,, + MAYCHANGE [events]) + s + s' /\ + (MAYCHANGE [PC; X20; X21; X22] ,, + MAYCHANGE [memory :> bytes (outbuf,8)] ,, + MAYCHANGE [events]) + s2 + s2') + (\s. 4) + (\s. 
4)` +*) + +(* Now, let's prove the program equivalence. *) +let EQUIV = prove(equiv_goal, + + (* Rewrite ALL, nonoverlapping, and LENGTH * *) + REWRITE_TAC[ALL;NONOVERLAPPING_CLAUSES; fst EXEC; fst EXEC2] THEN + REPEAT STRIP_TAC THEN + + (** Initialize **) + EQUIV_INITIATE_TAC eqin THEN + RULE_ASSUM_TAC (REWRITE_RULE[BIGNUM_FROM_MEMORY_BYTES]) THEN + + (* Do symbolic simulations on the two programs using EQUIV_STEPS_TAC. + As explained before, the action is an OCaml list. + Each item describes: + - ("equal",begin line number of program 1 (start from 0), + end line number of program 1 (not inclusive), + begin line number of program 2, + end line number of program 2) + : means that these instructions in program 1 and program 2 must + yield sysmbolically equivalent output. Therefore, EQUIV_STEPS_TAC + uses a lock-step simulation for these. + If the symbolic outputs of the matching instructions are not having + equal expression, it will print an error message. + Actually, it tries to solve a simple bit-vector equality such as + 'x * (y + 1) = x * y + x', + and can succeed. This is exactly the example case here. + - ("replace",beign line number of program 1, + end line number of program 1 (not inclusive), + begin line number of program 2, + end line number of program 2) + : means that these instructions in program 1 and 2 differ. + EQUIV_STEPS_TAC uses stuttering simulations for each program. + *) + EQUIV_STEPS_TAC [ + ("equal",0,1,0,1); + ("replace",1,3,1,3); + ("equal",3,4,3,4) + ] EXEC EXEC2 THEN + + REPEAT_N 2 ENSURES_N_FINAL_STATE_TAC THEN + (* Prove remaining clauses from the postcondition *) + ASM_REWRITE_TAC[] THEN + (* This tactic below is typically fixed and probably you will want to reuse. :) *) + CONJ_TAC THENL [ + (** SUBGOAL 1. Outputs **) + ASM_REWRITE_TAC[eqout; + BIGNUM_EXPAND_CONV `bignum_from_memory (outbuf,1) s`] THEN + REPEAT (HINT_EXISTS_REFL_TAC THEN ASM_REWRITE_TAC[]); + + (** SUBGOAL 2. Maychange pair **) + MONOTONE_MAYCHANGE_CONJ_TAC + ]);; + +(* + If the EQUIV_STEPS_TAC fails to prove that instructions that are supposed + to be equivalent according to actions are yielding equal output expressions, + it will print a message like this: + + ARM_LOCKSTEP_TAC (4,4) + Running left... + Running right... + 1 basis elements and 0 critical pairs + - Error: WORD_RULE could not prove + ` = ` + + If you are certain that these expressions must be equal, you can improve + `extra_word_CONV` of symbolic simulator by adding a custom word equation + to extra_word_CONV. 
+ + ``` + let org_convs = !extra_word_CONV;; + extra_word_CONV := (GEN_REWRITE_CONV I [])::org_convs;; + ``` +*) diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_equivtac2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_equivtac2.S new file mode 100644 index 00000000000..39f772cf54b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_equivtac2.S @@ -0,0 +1,4 @@ +ldp x21, x20, [x0] +mul x22, x21, x20 +add x22, x22, x21 // x21 * x20 + x21 +str x22, [x1] diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_loop.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_loop.S new file mode 100644 index 00000000000..46aac446e30 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_loop.S @@ -0,0 +1,6 @@ +loop: +add x2, x2, #2 +add x0, x0, #1 +cmp x0, x1 +bne loop + diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_loop.ml b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_loop.ml new file mode 100644 index 00000000000..1c99d9b3afc --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_loop.ml @@ -0,0 +1,151 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(****************************************************************************** + An example that proves equivalence of two loops. +******************************************************************************) + +needs "arm/proofs/equiv.ml";; + +(* Prove that these two loops are equivalent in the sense that the results of + two X2 are same. *) + +let loop_mc = define_assert_from_elf "loop_mc" "arm/tutorial/rel_loop.o" [ + 0x91000842; (* arm_ADD X2 X2 (rvalue (word 2)) *) + 0x91000400; (* arm_ADD X0 X0 (rvalue (word 1)) *) + 0xeb01001f; (* arm_CMP X0 X1 *) + 0x54ffffa1 (* arm_BNE (word 2097140) *) +];; + +let loop2_mc = define_assert_from_elf "loop2_mc" "arm/tutorial/rel_loop2.o" [ + 0x91000442; (* arm_ADD X2 X2 (rvalue (word 1)) *) + 0x91000442; (* arm_ADD X2 X2 (rvalue (word 1)) *) + 0x91000400; (* arm_ADD X0 X0 (rvalue (word 1)) *) + 0xeb01001f; (* arm_CMP X0 X1 *) + 0x54ffff81 (* arm_BNE (word 2097136) *) +];; + +let LOOP_EXEC = ARM_MK_EXEC_RULE loop_mc;; +let LOOP2_EXEC = ARM_MK_EXEC_RULE loop2_mc;; + +(* For relational reasoning, we use predicates and tactics that are slightly + different from those for unary reasoning. *) + +let LOOP_EQUIV = prove( + `forall pc1 pc2 n. + n > 0 /\ n < 2 EXP 64 ==> + // Relational hoare triple. + ensures2 arm + // Precondition + (\(s1,s2). aligned_bytes_loaded s1 (word pc1) loop_mc /\ + read PC s1 = word pc1 /\ + aligned_bytes_loaded s2 (word pc2) loop2_mc /\ + read PC s2 = word pc2 /\ + // X0 is the induction variable and X1 is n. + (read X0 s1 = word 0 /\ read X0 s2 = word 0 /\ + read X1 s1 = word n /\ read X1 s2 = word n /\ + // X2 must start equal. + (?k. read X2 s1 = k /\ read X2 s2 = k))) + // Postcondition + (\(s1,s2). aligned_bytes_loaded s1 (word pc1) loop_mc /\ + read PC s1 = word (pc1 + 12) /\ + aligned_bytes_loaded s2 (word pc2) loop2_mc /\ + read PC s2 = word (pc2 + 16) /\ + // They finish with an equal value. + (?k. read X2 s1 = k /\ read X2 s2 = k)) + // State components that may change. + (\(s1,s2) (s1',s2'). 
+ (MAYCHANGE [PC;X0;X2] ,, MAYCHANGE SOME_FLAGS ,, MAYCHANGE [events]) s1 s1' /\ + (MAYCHANGE [PC;X0;X2] ,, MAYCHANGE SOME_FLAGS ,, MAYCHANGE [events]) s2 s2') + // The number of small steps of the 'left' program and 'right' program. + (\s. 4 * n - 1) (\s. 5 * n - 1)`, + + REPEAT STRIP_TAC THEN REWRITE_TAC[SOME_FLAGS] THEN + (* Look at the definition of ENSURES2_WHILE_PAUP_TAC in arm/proofs/equiv.ml + to understand the meanings of arguments. *) + ENSURES2_WHILE_PAUP_TAC `0:num` `n:num` `pc1:num` `pc1+12` `pc2:num` `pc2+16` + `\(i:num) s1 s2. + read X0 s1 = word i /\ read X0 s2 = word i /\ + read X1 s1 = word n /\ read X1 s2 = word n /\ + (?k. read X2 s1 = k /\ read X2 s2 = k)` + `\(i:num) s. read ZF s <=> (word i:int64) = word n` + `\(i:num) s. read ZF s <=> (word i:int64) = word n` + `\(i:num). 3` + `\(i:num). 4` + `0` `0` `0` `0` `1` `1` THEN + REPEAT CONJ_TAC THENL [ + (* # loop itrs > 0 *) + ASM_ARITH_TAC; + + (* pre *) + MATCH_MP_TAC ENSURES2_TRIVIAL THEN + REWRITE_TAC[FORALL_PAIR_THM] THEN + REPEAT GEN_TAC THEN + MONOTONE_MAYCHANGE_CONJ_TAC; + + (* now the main loop! *) + REPEAT STRIP_TAC THEN + (* Start symbolic execution of two programs. *) + ENSURES2_INIT_TAC "s0" "s0'" THEN + + FIRST_X_ASSUM (MP_TAC o (check (is_exists o concl))) THEN + STRIP_TAC THEN + REWRITE_TAC[GSYM CONJ_ASSOC] THEN + + (* Symbolically execute the left program only. *) + ARM_N_STUTTER_LEFT_TAC LOOP_EXEC (1--3) None THEN + (* Symbolically execute the right program only. *) + ARM_N_STUTTER_RIGHT_TAC LOOP2_EXEC (1--4) "'" None THEN + (* Let's prove the postcondition. *) + REPEAT_N 2 ENSURES_N_FINAL_STATE_TAC THEN + ASM_REWRITE_TAC[WORD_ADD] THEN + + CONJ_TAC THENL [ + CONJ_TAC THENL [ + META_EXISTS_TAC THEN UNIFY_REFL_TAC; + REWRITE_TAC[VAL_EQ_0] THEN CONV_TAC WORD_RULE; + ]; + + MONOTONE_MAYCHANGE_CONJ_TAC + ]; + + (* backedge *) + REPEAT STRIP_TAC THEN + ENSURES2_INIT_TAC "s0" "s0'" THEN + UNDISCH_TAC `?k. 
read X2 s0 = k /\ read X2 s0' = k` THEN + STRIP_TAC THEN + REWRITE_TAC[GSYM CONJ_ASSOC] THEN + + ARM_N_STUTTER_LEFT_TAC LOOP_EXEC (1--1) None THEN + ARM_N_STUTTER_RIGHT_TAC LOOP2_EXEC (1--1) "'" None THEN + REPEAT_N 2 ENSURES_N_FINAL_STATE_TAC THEN + ASM_REWRITE_TAC[WORD_ADD] THEN + + SUBGOAL_THEN `(word i:int64 = word n) <=> F` SUBST_ALL_TAC THENL [ + REWRITE_TAC[WORD_EQ;CONG;DIMINDEX_64] THEN + IMP_REWRITE_TAC[MOD_LT] THEN ASM_ARITH_TAC; + + ALL_TAC + ] THEN + REWRITE_TAC[] THEN + CONJ_TAC THENL [ + + META_EXISTS_TAC THEN UNIFY_REFL_TAC; + + MONOTONE_MAYCHANGE_CONJ_TAC + ]; + + (* postcond *) + MATCH_MP_TAC ENSURES2_TRIVIAL THEN + REWRITE_TAC[FORALL_PAIR_THM] THEN + CONJ_TAC THENL [MESON_TAC[]; ALL_TAC] THEN + REPEAT GEN_TAC THEN MONOTONE_MAYCHANGE_CONJ_TAC; + + (* counter 1 *) + REWRITE_TAC[NSUM_CONST_NUMSEG] THEN ASM_ARITH_TAC; + + (* counter 2 *) + REWRITE_TAC[NSUM_CONST_NUMSEG] THEN ASM_ARITH_TAC; + ]);; diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_loop2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_loop2.S new file mode 100644 index 00000000000..60e78c5354d --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_loop2.S @@ -0,0 +1,7 @@ +loop: +add x2, x2, #1 +add x2, x2, #1 +add x0, x0, #1 +cmp x0, x1 +bne loop + diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_reordertac.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_reordertac.S new file mode 100644 index 00000000000..ff10c42eef8 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_reordertac.S @@ -0,0 +1,6 @@ +ldr x10, [x0] +add x10, x10, #1 +str x10, [x1] +ldr x10, [x0, #8] +add x10, x10, #2 +str x10, [x1, #8] diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_reordertac.ml b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_reordertac.ml new file mode 100644 index 00000000000..84fcba1fbdb --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_reordertac.ml @@ -0,0 +1,187 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(****************************************************************************** + An example that proves equivalence of two straight-line codes + whose instructions are shuffled. +******************************************************************************) + +(* Please copy this file to the root directory of + s2n-bignum, then follow the instructions. *) + +needs "arm/proofs/equiv.ml";; + +(* This example will define & prove the equivalence of two programs + whose instructions are reordered, using ARM_N_STEPS_AND_ABBREV_TAC and + ARM_N_STEPS_AND_REWRITE_TAC. + These tactics receive the mapping between the lines of instructions + of the two programs (which is an OCaml integer list). + ARM_N_STEPS_AND_ABBREV_TAC symbolically simulates the "left" program, + introduces abbreviations of the output symbolic expressions of each + instruction, and stores it to an OCaml reference variable. + ARM_N_STEPS_AND_REWRITE_TAC symbolically simulates the "right" program, + finds the right abbreviation expression according to the line-number + mapping information, and rewrites the output expressions using the matched + abbreviation. 
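+   For example, the mapping used later in this file is
+
+     let inst_map = [1;4;2;5;3;6];;
+
+   i.e. instructions 1..6 of the second program (mc2) correspond to
+   instructions 1, 4, 2, 5, 3 and 6 of the first program (mc): the two loads
+   of mc2 map to lines 1 and 4 of mc, the two adds to lines 2 and 5, and the
+   two stores to lines 3 and 6.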
*) + +let mc = define_assert_from_elf "mc" "arm/tutorial/rel_reordertac.o" [ + 0xf940000a; (* arm_LDR X10 X0 (Immediate_Offset (word 0)) *) + 0x9100054a; (* arm_ADD X10 X10 (rvalue (word 1)) *) + 0xf900002a; (* arm_STR X10 X1 (Immediate_Offset (word 0)) *) + 0xf940040a; (* arm_LDR X10 X0 (Immediate_Offset (word 8)) *) + 0x9100094a; (* arm_ADD X10 X10 (rvalue (word 2)) *) + 0xf900042a (* arm_STR X10 X1 (Immediate_Offset (word 8)) *) +];; + +(* Note that the used registers are different between mc and mc2 + (X10 vs. X10,X11). This is fine since the tactics can smartly + map the registers. + Also, this reordering is correct only of [X0, X0+16) is disjoint with + [X1, X1+16). We will have this as an assumption in the equivalence + goal. *) +let mc2 = define_assert_from_elf "mc2" "arm/tutorial/rel_reordertac2.o" [ + 0xf940000a; (* arm_LDR X10 X0 (Immediate_Offset (word 0)) *) + 0xf940040b; (* arm_LDR X11 X0 (Immediate_Offset (word 8)) *) + 0x9100054a; (* arm_ADD X10 X10 (rvalue (word 1)) *) + 0x9100096b; (* arm_ADD X11 X11 (rvalue (word 2)) *) + 0xf900002a; (* arm_STR X10 X1 (Immediate_Offset (word 0)) *) + 0xf900042b (* arm_STR X11 X1 (Immediate_Offset (word 8)) *) +];; + +let EXEC = ARM_MK_EXEC_RULE mc;; +let EXEC2 = ARM_MK_EXEC_RULE mc2;; + +(* Define the equality between the input states. *) +let eqin = new_definition + `forall s1 s1' inbuf outbuf. + (eqin:(armstate#armstate)->int64->int64->bool) (s1,s1') inbuf outbuf <=> + (// The values of buffer pointers, X0 and X1. + // Their values are symbolically defined as inbuf and outbuf. + // outbuf is also used for the nonoverlapping precondition between + // the output buffer and the program bytecode. + read X0 s1 = inbuf /\ + read X0 s1' = inbuf /\ + read X1 s1 = outbuf /\ + read X1 s1' = outbuf /\ + // The equal buffer contents at the input buffer. '2' stands for 2 words + // (and 1 word is 8 bytes, hence 2*8=16 bytes) + (exists n. + bignum_from_memory (inbuf,2) s1 = n /\ + bignum_from_memory (inbuf,2) s1' = n))`;; + +(* Define the equality between the output states. *) +let eqout = new_definition + `forall s1 s1' inbuf outbuf. + (eqout:(armstate#armstate)->int64->int64->bool) (s1,s1') inbuf outbuf <=> + (read X0 s1 = inbuf /\ + read X0 s1' = inbuf /\ + read X1 s1 = outbuf /\ + read X1 s1' = outbuf /\ + (exists n. + bignum_from_memory (inbuf,2) s1 = n /\ + bignum_from_memory (inbuf,2) s1' = n) /\ + (exists n. + bignum_from_memory (outbuf,2) s1 = n /\ + bignum_from_memory (outbuf,2) s1' = n))`;; + +(* Now, build the program equivalence statement using + 'mk_equiv_statement_simple'. + Its first argument states the assumption that will appear at + LHS of ' ==> ensures2 ..(equiv statement)..'. + + If it fails, please try `arm_print_log := true`. *) +let equiv_goal = mk_equiv_statement_simple + `ALL (nonoverlapping (outbuf,16)) [ + (word pc:int64, LENGTH mc); + (word pc2:int64, LENGTH mc2); + (inbuf:int64, 16) + ]` + eqin (* Input state equivalence *) + eqout (* Output state equivalence *) + mc (* First program machine code *) + `MAYCHANGE [PC; X10] ,, MAYCHANGE [memory :> bytes (outbuf, 16)] ,, MAYCHANGE [events]` + mc2 (* Second program machine code *) + `MAYCHANGE [PC; X10; X11] ,, MAYCHANGE [memory :> bytes (outbuf, 16)] ,, MAYCHANGE [events]`;; + +(* equiv_goal is: + `forall pc pc2 inbuf outbuf. + ALL (nonoverlapping (outbuf,16)) + [word pc,LENGTH mc; word pc2,LENGTH mc2; inbuf,16] + ==> ensures2 arm + (\(s,s2). 
+ aligned_bytes_loaded s (word pc) mc /\ + read PC s = word pc /\ + aligned_bytes_loaded s2 (word pc2) mc2 /\ + read PC s2 = word pc2 /\ + eqin (s,s2) inbuf outbuf) + (\(s,s2). + aligned_bytes_loaded s (word pc) mc /\ + read PC s = word (pc + 24) /\ + aligned_bytes_loaded s2 (word pc2) mc2 /\ + read PC s2 = word (pc2 + 24) /\ + eqout (s,s2) inbuf outbuf) + (\(s,s2) (s',s2'). + (MAYCHANGE [PC; X10] ,, + MAYCHANGE [memory :> bytes (outbuf,16)] ,, + MAYCHANGE [events]) + s + s' /\ + (MAYCHANGE [PC; X10; X11] ,, + MAYCHANGE [memory :> bytes (outbuf,16)] ,, + MAYCHANGE [events]) + s2 + s2') + (\s. 6) + (\s. 6)` +*) + +(* Line numbers from the second program (mc2) to the first program (mc1). *) +let inst_map = [1;4;2;5;3;6];; + +(* (state number, (equation, fresh var)) *) +let state_to_abbrevs: (int * thm) list ref = ref [];; + +(* Now, let's prove the program equivalence. *) +let EQUIV = prove(equiv_goal, + + (* Rewrite ALL, nonoverlapping, and LENGTH * *) + REWRITE_TAC[ALL;NONOVERLAPPING_CLAUSES; fst EXEC; fst EXEC2] THEN + REPEAT STRIP_TAC THEN + + (** Initialize **) + EQUIV_INITIATE_TAC eqin THEN + RULE_ASSUM_TAC (REWRITE_RULE[BIGNUM_FROM_MEMORY_BYTES]) THEN + + (* Left *) + ARM_N_STEPS_AND_ABBREV_TAC EXEC (1--(List.length inst_map)) + state_to_abbrevs None THEN + + (* Right *) + ARM_N_STEPS_AND_REWRITE_TAC EXEC2 (1--(List.length inst_map)) + inst_map state_to_abbrevs None THEN + + (* Running the statements above step by step will raise an error + message saying that the tactic is not VALID. You can temporarily + disable the message by redefining 'e' as follows: + + let e tac = refine(by(tac));; + + The whole proof ("prove(...)") will still run okay. + *) + + REPEAT_N 2 ENSURES_N_FINAL_STATE_TAC THEN + (* Prove remaining clauses from the postcondition *) + ASM_REWRITE_TAC[] THEN + (* This tactic below is typically fixed and probably you will want to reuse. :) *) + CONJ_TAC THENL [ + (** SUBGOAL 1. Outputs **) + ASM_REWRITE_TAC[eqout; + BIGNUM_EXPAND_CONV `bignum_from_memory (outbuf,2) s`] THEN + REPEAT CONJ_TAC THEN + REPEAT (HINT_EXISTS_REFL_TAC THEN ASM_REWRITE_TAC[]); + + (** SUBGOAL 2. Maychange pair **) + MONOTONE_MAYCHANGE_CONJ_TAC + ]);; diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_reordertac2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_reordertac2.S new file mode 100644 index 00000000000..a58c3e01ee7 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_reordertac2.S @@ -0,0 +1,6 @@ +ldr x10, [x0] +ldr x11, [x0, #8] +add x10, x10, #1 +add x11, x11, #2 +str x10, [x1] +str x11, [x1, #8] diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_simp.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_simp.S new file mode 100644 index 00000000000..1d1d8f2b40c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_simp.S @@ -0,0 +1,3 @@ +add x0, x0, #1 +add x1, x1, #2 +add x0, x0, #3 diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_simp.ml b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_simp.ml new file mode 100644 index 00000000000..68083f0e302 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_simp.ml @@ -0,0 +1,95 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(****************************************************************************** + An example that proves equivalence of two straight-line codes. 
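+  (As can be read off the listings below, both programs leave X0 = a + 4 and
+   X1 = b + 2 for initial values a and b, which is why their final register
+   values can be proven equal.)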
+******************************************************************************) + +(* Please copy this file to the root directory of + s2n-bignum, then follow the instructions. *) + +needs "arm/proofs/equiv.ml";; + +(* Prove that given x0 and x1 are equal, their final results are also equal. *) + +let simp_mc = define_assert_from_elf "simp_mc" "arm/tutorial/rel_simp.o" [ + 0x91000400; (* arm_ADD X0 X0 (rvalue (word 1)) *) + 0x91000821; (* arm_ADD X1 X1 (rvalue (word 2)) *) + 0x91000c00 (* arm_ADD X0 X0 (rvalue (word 3)) *) +];; + +let simp2_mc = define_assert_from_elf "simp2_mc" "arm/tutorial/rel_simp2.o" [ + 0x91001000; (* arm_ADD X0 X0 (rvalue (word 4)) *) + 0x91000821 (* arm_ADD X1 X1 (rvalue (word 2)) *) +];; + +let SIMP_EXEC = ARM_MK_EXEC_RULE simp_mc;; +let SIMP2_EXEC = ARM_MK_EXEC_RULE simp2_mc;; + +(* For relational reasoning, we use predicates and tactics that are slightly + different from those for unary reasoning. *) + +let SIMP_EQUIV = prove( + `forall pc1 pc2 a b. + // Relational hoare triple. + ensures2 arm + // Precondition + (\(s1,s2). aligned_bytes_loaded s1 (word pc1) simp_mc /\ + read PC s1 = word pc1 /\ + aligned_bytes_loaded s2 (word pc2) simp2_mc /\ + read PC s2 = word pc2 /\ + // X0 and X1 start equal. + read X0 s1 = a /\ read X0 s2 = a /\ + read X1 s1 = b /\ read X1 s2 = b) + // Postcondition + (\(s1,s2). aligned_bytes_loaded s1 (word pc1) simp_mc /\ + read PC s1 = word (pc1 + 12) /\ + aligned_bytes_loaded s2 (word pc2) simp2_mc /\ + read PC s2 = word (pc2 + 8) /\ + // They finish with an equal value. + (?k. read X0 s1 = k /\ read X0 s2 = k) /\ + (?k2. read X1 s1 = k2 /\ read X1 s2 = k2)) + // State components that may change. + (\(s1,s2) (s1',s2'). + // PC,X0,X1 may change in the left program + MAYCHANGE [PC;X0;X1] s1 s1' /\ + // .. and in the right program as well. + MAYCHANGE [PC;X0;X1] s2 s2') + // The number of small steps of the 'left' program and 'right' program. + // 'ensures2' needs the number of small steps taken to reach at the + // postcondition. Similarly, 'ensures_n' is a unary predicate similar to + // 'ensures' but takes the number of steps too. 'ensures_n' will not + // appear in this example. + (\s. 3) (\s. 2)`, + + REPEAT STRIP_TAC THEN + (* Start symbolic execution of the two programs! The left program's initial + state is named as s0, and the right is s0'. *) + ENSURES2_INIT_TAC "s0" "s0'" THEN + + (* Symbolically execute the left program only. *) + ARM_N_STUTTER_LEFT_TAC SIMP_EXEC (1--3) None THEN + (* Symbolically execute the right program only. "'" is the suffix of the + state name. *) + ARM_N_STUTTER_RIGHT_TAC SIMP2_EXEC (1--2) "'" None THEN + + (* Let's prove the postcondition. *) + REPEAT_N 2 ENSURES_N_FINAL_STATE_TAC THEN + ASM_REWRITE_TAC[] THEN + + CONJ_TAC THENL [ + (* ((?k. word_add a (word 4) = k) + Actually, simplification procedure in symbolic execution tactic already + folded 'word_add (word_add a (word 1)) (word 3)' into + 'word_add a (word 4)'. *) + (* META_EXISTS_TAC is somewhat similar to eexists in Coq. 
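+     Roughly, it replaces the existentially quantified k with a metavariable,
+     so the goal becomes `word_add a (word 4) = ?k`, and the UNIFY_REFL_TAC
+     that follows discharges it by instantiating the metavariable with the
+     left-hand side.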
*) + CONJ_TAC THENL [ + META_EXISTS_TAC THEN UNIFY_REFL_TAC; + META_EXISTS_TAC THEN UNIFY_REFL_TAC; + ]; + + (* Maychange pair *) + MONOTONE_MAYCHANGE_CONJ_TAC + ]);; diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_simp2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_simp2.S new file mode 100644 index 00000000000..f8d7753cc04 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_simp2.S @@ -0,0 +1,2 @@ +add x0, x0, #4 +add x1, x1, #2 diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_veceq.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_veceq.S new file mode 100644 index 00000000000..e60e6235da0 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_veceq.S @@ -0,0 +1,28 @@ +#define a0 x10 +#define a1 x11 +#define res0 x20 +#define res1 x21 +#define res2 x22 +#define res3 x23 +#define res1t x27 +#define res2t x28 +#define res3t x29 +#define a0a0_hi x12 +#define a0a1_lo x13 +#define a0a1_hi x14 +#define a1a1_lo x15 +#define a1a1_hi x16 + +ldp a0, a1, [x1] +mul res0, a0, a0 +umulh a0a0_hi, a0, a0 +mul a0a1_lo, a0, a1 +umulh a0a1_hi, a0, a1 +mul a1a1_lo, a1, a1 +umulh a1a1_hi, a1, a1 +adds res1t, a0a0_hi, a0a1_lo +adcs res2t, a1a1_lo, a0a1_hi +adc res3t, a1a1_hi, xzr +adds res1, res1t, a0a1_lo +adcs res2, res2t, a0a1_hi +adc res3, res3t, xzr diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_veceq.ml b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_veceq.ml new file mode 100644 index 00000000000..287eada88a9 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_veceq.ml @@ -0,0 +1,148 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(****************************************************************************** + An example that proves the equivalence of + two implementations of 128x128->256-bit squaring. +******************************************************************************) + +needs "arm/proofs/equiv.ml";; +(* neon_helper.ml has lemmas and tactics that are useful to prove programs + manipulating SIMD registers. *) +needs "arm/proofs/neon_helper.ml";; + +(* This is a realistic (and a bit 'dirty') example that shows how equivalence + of vectorization is proven using relational reasoning. + It is always welcome to clean this proof further. 
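+   The 'left' program (rel_veceq.S) computes the 256-bit square with scalar
+   MUL/UMULH instructions only, whereas the 'right' program (rel_veceq2.S)
+   vectorizes the two 64x64-bit squarings with NEON (UMULL/UMULL2/XTN/UZP2)
+   and folds in the cross product with scalar MUL/UMULH. The statement below
+   says that both end with the same four result digits, just held in
+   different registers (see equiv_output_states).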
*) + +let veceq_mc = define_assert_from_elf "veceq_mc" "arm/tutorial/rel_veceq.o" [ + 0xa9402c2a; (* arm_LDP X10 X11 X1 (Immediate_Offset (iword (&0))) *) + 0x9b0a7d54; (* arm_MUL X20 X10 X10 *) + 0x9bca7d4c; (* arm_UMULH X12 X10 X10 *) + 0x9b0b7d4d; (* arm_MUL X13 X10 X11 *) + 0x9bcb7d4e; (* arm_UMULH X14 X10 X11 *) + 0x9b0b7d6f; (* arm_MUL X15 X11 X11 *) + 0x9bcb7d70; (* arm_UMULH X16 X11 X11 *) + 0xab0d019b; (* arm_ADDS X27 X12 X13 *) + 0xba0e01fc; (* arm_ADCS X28 X15 X14 *) + 0x9a1f021d; (* arm_ADC X29 X16 XZR *) + 0xab0d0375; (* arm_ADDS X21 X27 X13 *) + 0xba0e0396; (* arm_ADCS X22 X28 X14 *) + 0x9a1f03b7 (* arm_ADC X23 X29 XZR *) +];; + +let VECEQ_EXEC = ARM_MK_EXEC_RULE veceq_mc;; + +let veceq2_mc = define_assert_from_elf "veceq2_mc" "arm/tutorial/rel_veceq2.o" [ + 0xa9403c29; (* arm_LDP X9 X15 X1 (Immediate_Offset (iword (&0))) *) + 0x3dc0003e; (* arm_LDR Q30 X1 (Immediate_Offset (word 0)) *) + 0x2ebec3c0; (* arm_UMULL_VEC Q0 Q30 Q30 32 *) + 0x6ebec3c2; (* arm_UMULL2_VEC Q2 Q30 Q30 32 *) + 0x0ea12bd8; (* arm_XTN Q24 Q30 32 *) + 0x4e9e5bde; (* arm_UZP2 Q30 Q30 Q30 32 *) + 0x2eb8c3de; (* arm_UMULL_VEC Q30 Q30 Q24 32 *) + 0x4e083c07; (* arm_UMOV X7 Q0 0 8 *) + 0x4e183c0e; (* arm_UMOV X14 Q0 1 8 *) + 0x4e083c53; (* arm_UMOV X19 Q2 0 8 *) + 0x4e183c56; (* arm_UMOV X22 Q2 1 8 *) + 0x4e083fc4; (* arm_UMOV X4 Q30 0 8 *) + 0x4e183fcc; (* arm_UMOV X12 Q30 1 8 *) + 0xab0484f5; (* arm_ADDS X21 X7 (Shiftedreg X4 LSL 33) *) + 0xd35ffc84; (* arm_LSR X4 X4 31 *) + 0x9a0401ce; (* arm_ADC X14 X14 X4 *) + 0xab0c8673; (* arm_ADDS X19 X19 (Shiftedreg X12 LSL 33) *) + 0xd35ffd84; (* arm_LSR X4 X12 31 *) + 0x9a0402d6; (* arm_ADC X22 X22 X4 *) + 0x9b0f7d24; (* arm_MUL X4 X9 X15 *) + 0x9bcf7d2c; (* arm_UMULH X12 X9 X15 *) + 0xab0405d8; (* arm_ADDS X24 X14 (Shiftedreg X4 LSL 1) *) + 0x93c4fd84; (* arm_EXTR X4 X12 X4 63 *) + 0xba040273; (* arm_ADCS X19 X19 X4 *) + 0xd37ffd84; (* arm_LSR X4 X12 63 *) + 0x9a0402c4 (* arm_ADC X4 X22 X4 *) +];; + +let VECEQ2_EXEC = ARM_MK_EXEC_RULE veceq2_mc;; + + +(* Define the equivalence of input states and output states. *) + +let equiv_input_states = new_definition + `forall s1 s1' x. + (equiv_input_states:(armstate#armstate)->int64->bool) (s1,s1') x <=> + (read X1 s1 = x /\ read X1 s1' = x /\ + exists a. bignum_from_memory (x,2) s1 = a /\ + bignum_from_memory (x,2) s1' = a)`;; + +let equiv_output_states = new_definition + `forall s1 s1'. + (equiv_output_states:(armstate#armstate)->bool) (s1,s1') <=> + (exists a. read X20 s1 = a /\ read X21 s1' = a /\ + (exists b. read X21 s1 = b /\ read X24 s1' = b /\ + (exists c. read X22 s1 = c /\ read X19 s1' = c /\ + (exists d. read X23 s1 = d /\ read X4 s1' = d))))`;; + + +(* Define the equivalence statement which is ensures2 predicate. + Please look at the definition of mk_equiv_statement_simple for full + definitions of its parameters. *) + +let equiv_goal1 = mk_equiv_statement_simple + `T` (* assumption such as nonoverlapping; nothing here, so simply T. *) + equiv_input_states + equiv_output_states + veceq_mc + `MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X20;X21;X22;X23;X27;X28;X29]` + veceq2_mc + `MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19;X21;X22;X24]`;; + +(* 'actions' is a list of line diffs between the two assembly files (textual + form). This isn't important in this example, but to understand when + it is useful you might want to look at proofs in s2n-bignum that use + EQUIV_STEPS_TAC. 
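+   Roughly, ("replace",0,13,0,26) marks instructions [0,13) of the left
+   program and [0,26) of the right program - i.e. both programs in full -
+   as a differing region that is simulated independently on each side,
+   whereas an "equal" action steps matching instructions in lock-step.
+   For instance, a hypothetical diff of two mostly-identical programs
+   might look like
+     [("equal",0,5,0,5); ("replace",5,9,5,12); ("equal",9,20,12,23)]
+   where the first five and the last eleven instructions correspond
+   one-to-one.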
*) +let actions = [ + ("replace",0,13,0,26) +];; + +(* After every small step, simplify the symbolic expression using + a new custom rewrite rule that is WORD_BITMANIP_SIMP_LEMMAS. *) +extra_word_CONV := + [GEN_REWRITE_CONV I [WORD_BITMANIP_SIMP_LEMMAS]] + @ (!extra_word_CONV);; + + +let VECTORIZE_SQR_EQUIV = prove(equiv_goal1, + + REWRITE_TAC[MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI; + fst VECEQ_EXEC; fst VECEQ2_EXEC] THEN + REPEAT STRIP_TAC THEN + (** Initialize **) + EQUIV_INITIATE_TAC equiv_input_states THEN + RULE_ASSUM_TAC (REWRITE_RULE[BIGNUM_FROM_MEMORY_BYTES]) THEN + + (* Symbolically simulate each program, to the last instructions. *) + EQUIV_STEPS_TAC actions VECEQ_EXEC VECEQ2_EXEC THEN + + (* For some reason, using this additional RULE_ASSUME_TAC was necessary... + Adding these rules to extra_word_CONV didn't work. Yes, this is a 'dirty' + part of the current status (= manual rewrites are sometimes necessary). + Also, these rewrite rules (WORD_SQR128_DIGIT0, ...) are not succinct. + Would be great if their proofs are shorter at least. *) + RULE_ASSUM_TAC (REWRITE_RULE[WORD_SQR128_DIGIT0; + WORD_SQR128_DIGIT1;WORD_SQR128_DIGIT2; + WORD_SQR128_DIGIT3]) THEN + + REPEAT_N 2 ENSURES_N_FINAL_STATE_TAC THEN + (* Prove remaining clauses from the postcondition *) + ASM_REWRITE_TAC[] THEN + CONJ_TAC THENL [ + (* Prove the equivalence! *) + ASM_REWRITE_TAC[equiv_output_states;mk_equiv_regs;mk_equiv_bool_regs] THEN + REPEAT (HINT_EXISTS_REFL_TAC THEN ASM_REWRITE_TAC[]); + + MONOTONE_MAYCHANGE_CONJ_TAC + ]);; diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_veceq2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_veceq2.S new file mode 100644 index 00000000000..521374de054 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_veceq2.S @@ -0,0 +1,26 @@ +ldp x9, x15, [x1] +ldr q30, [x1] +umull v0.2D, v30.2S, v30.2S +umull2 v2.2D, v30.4S, v30.4S +xtn v24.2S, v30.2D +uzp2 v30.4S, v30.4S, v30.4S +umull v30.2D, v30.2S, v24.2S +mov x7, v0.d[0] +mov x14, v0.d[1] +mov x19, v2.d[0] +mov x22, v2.d[1] +mov x4, v30.d[0] +mov x12, v30.d[1] +adds x21, x7, x4, lsl #33 +lsr x4, x4, #31 +adc x14, x14, x4 +adds x19, x19, x12, lsl #33 +lsr x4, x12, #31 +adc x22, x22, x4 +mul x4, x9, x15 +umulh x12, x9, x15 +adds x24, x14, x4, lsl #1 +extr x4, x12, x4, #63 +adcs x19, x19, x4 +lsr x4, x12, #63 +adc x4, x22, x4 //x21,x24,x19,x4 diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rodata.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rodata.S new file mode 100644 index 00000000000..e0554712153 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rodata.S @@ -0,0 +1,109 @@ +/* + This assembly file is a cleaned (and less ABI-compliant) version of GCC + output of the following + C program: + + const int x[10] = {2, 4, 6, 8, 10, 12, 14, 16, 18, 20}; + const int y[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + const int z = 1; + + int f(uint64_t i) { + return x[i] + y[i]; + } + + int g(int64_t i) { + return f(i + z); + } +*/ + +#if defined(__linux__) && defined(__ELF__) +.section .rodata + .global x + .type x, %object + .size x, 40 +#elif defined(__APPLE__) +.const_data +#endif + .align 3 +x: + .word 2 + .word 4 + .word 6 + .word 8 + .word 10 + .word 12 + .word 14 + .word 16 + .word 18 + .word 20 + +#if defined(__linux__) && defined(__ELF__) + .global y + .type y, %object + .size y, 40 +#endif + .align 3 +y: + .word 1 + .word 2 + .word 3 + .word 4 + .word 5 + .word 6 + .word 7 + .word 8 + .word 9 + .word 10 + 
+#if defined(__linux__) && defined(__ELF__) + .global z + .type z, %object + .size z, 4 +#endif + .align 3 +z: + .word 1 + +.text + .align 2 +#if defined(__linux__) && defined(__ELF__) + .type f, %function +#endif + +f: + mov x3, x0 +#if defined(__linux__) && defined(__ELF__) + adrp x10, x + add x10, x10, :lo12:x +#else + adrp x10, x@PAGE + add x10, x10, x@PAGEOFF +#endif + mov x1, x3 + ldr w1, [x10, x1, lsl 2] +#if defined(__linux__) && defined(__ELF__) + adrp x11, y + add x11, x11, :lo12:y +#else + adrp x11, y@PAGE + add x11, x11, y@PAGEOFF +#endif + mov x2, x3 + ldr w0, [x11, x2, lsl 2] + add w0, w1, w0 + ret + +#if defined(__linux__) && defined(__ELF__) + .type g, %function +#endif +g: +#if defined(__linux__) && defined(__ELF__) + adrp x10, z + add x10, x10, :lo12:z +#else + adrp x10, z@PAGE + add x10, x10, z@PAGEOFF +#endif + ldr w1, [x10] + add x0, x1, x0 + b f diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rodata.ml b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rodata.ml new file mode 100644 index 00000000000..65d92cf6da5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rodata.ml @@ -0,0 +1,232 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(****************************************************************************** + Verifying a program that reads constant data from .rodata +******************************************************************************) + +needs "arm/proofs/base.ml";; + +(* The following command will print the assertion checker fn of + "arm/tutorial/rodata.o": + + print_literal_relocs_from_elf "arm/tutorial/rodata.o";; + + Or, you can also use + + save_literal_relocs_from_elf "out.txt" "arm/tutorial/rodata.o";; +*) + +let a_mc,a_constants_data = define_assert_relocs_from_elf "a_mc" + "arm/tutorial/rodata.o" +(fun w BL ADR ADRP ADD_rri64 -> [ +(* int f(int) *) + w 0xaa0003e3; (* arm_MOV X3 X0 *) + + (* NOTE: The two entries below have the names of symbols. If they appear as + an empty string on your custom object file, please check whether the + symbols are defined as global in assembly. Local symbols will not have + their names recorded in string table. *) + ADRP (mk_var("x",`:num`),0,4,10); + ADD_rri64 (mk_var("x",`:num`),0,10,10); + + w 0xaa0303e1; (* arm_MOV X1 X3 *) + w 0xb8617941; (* arm_LDR W1 X10 (Shiftreg_Offset X1 2) *) + + ADRP (mk_var("y",`:num`),0,20,11); + ADD_rri64 (mk_var("y",`:num`),0,11,11); + + w 0xaa0303e2; (* arm_MOV X2 X3 *) + w 0xb8627960; (* arm_LDR W0 X11 (Shiftreg_Offset X2 2) *) + w 0x0b000020; (* arm_ADD W0 W1 W0 *) + w 0xd65f03c0; (* arm_RET X30 *) + +(* int g(int) *) + ADRP (mk_var("z",`:num`),0,44,10); + ADD_rri64 (mk_var("z",`:num`),0,10,10); + w 0xb9400141; (* arm_LDR W1 X10 (Immediate_Offset (word 0)) *) + w 0x8b000020; (* arm_ADD X0 X1 X0 *) + w 0x17fffff1 (* arm_B (word 268435396) *) +]);; + +(* Compared to the result of define_asserts_from_elf, the return value of + define_assert_relocs_from_elf has the following differences: + + 1. It returns a_constants_data, which is a list of thm. + Each thm describes a definition of an object in a read-only section: + + # a_constants_data;; + + - : thm list = + [|- z_data = [word 30; word 0; word 0; word 0]; + |- y_data = [word 1; word 0; word 0; word 0; ...]; + |- x_data = [word 2; word 0; word 0; word 0; ...]] + + 2. 
The returned a_mc is a function that takes the addresses of pc, x, y and + (x and y are the addresses of the two constant arrays) and returns + the corresponding machine code. + + # a_mc;; + + - : thm = + |- forall x pc y. a_mc pc x y = CONS (word 227) (...) +*) + +let EXEC = ARM_MK_EXEC_RULE a_mc;; + +(* Two helper tactics. + 1. INTRO_READ_MEMORY_FROM_BYTES8_TAC t: + If t is `read (memory :> bytesN ...) sM`, prove a theorem + `read (memory :> bytesN ...) sM = ` and introduce it + as an assumption, from the existing `read (memory :> bytes8 ..) sM = ..` + assumptions. + + 2. EXPLODE_BYTELIST_ASSUM_TAC: + Find assumption `read (memory :> bytelist (...)) s = ..` and explode + it to a list of `read (memory :> bytes8 (...)) s = ..` and reintroduce + them as assumptions. +*) +let INTRO_READ_MEMORY_FROM_BYTES8_TAC (t:term) = + (* Convert t into word_joins of 1-byte reads. *) + let r = REWRITE_CONV [READ_MEMORY_BYTESIZED_SPLIT] t in + (* Offset canonicalization, and then rewrite using assumptions *) + let r = REWRITE_RULE[WORD_ADD_ASSOC_CONSTS;WORD_ADD_0;ARITH] r in + MP_TAC r THEN + ASM (GEN_REWRITE_TAC (LAND_CONV o ONCE_DEPTH_CONV)) [] THEN + CONV_TAC (LAND_CONV WORD_REDUCE_CONV) THEN + DISCH_TAC;; + +let EXPLODE_BYTELIST_ASSUM_TAC = + FIRST_X_ASSUM (fun th -> + let _ = find_term (fun t -> name_of t = "bytelist") (concl th) in + (* Unfold the constant arrays! *) + let unfolded_bytes_loaded = REWRITE_RULE a_constants_data th in + (* Fold LENGTH array, and explode arr using BYTELIST_EXPAND_CONV *) + MP_TAC (CONV_RULE (ONCE_DEPTH_CONV LENGTH_CONV THENC + LAND_CONV BYTELIST_EXPAND_CONV) + unfolded_bytes_loaded)) THEN + (* [a;b;..] = [x;y;..] is a = x /\ b = y /\ ... *) + REWRITE_TAC [CONS_11] THEN + STRIP_TAC;; + + +let F_SPEC = prove(`forall x y z i pc retpc. + // These two assumptions state that the distance between symbol x and pc+4 + // (which is the first adrp) do not overflow, and so does symbol y and + // pc+20. + adrp_within_bounds (word x) (word (pc + 4)) /\ + adrp_within_bounds (word y) (word (pc + 20)) /\ + val i < 10 + ==> + ensures arm + (\s. aligned_bytes_loaded s (word pc) (a_mc pc x y z) /\ + read (memory :> bytelist (word x, LENGTH x_data)) s = x_data /\ + read (memory :> bytelist (word y, LENGTH y_data)) s = y_data /\ + read PC s = word pc /\ + read X0 s = i /\ + read X30 s = retpc) + (\s. read W0 s = word (3 * (1 + val i)) /\ + read PC s = retpc) + (MAYCHANGE [X0; X1; X2; X3; X10; X11; PC] ,, MAYCHANGE [events])`, + + REPEAT STRIP_TAC THEN + ENSURES_INIT_TAC "s0" THEN + + (* Let's prove the constant array is storing some structured int sequence. *) + SUBGOAL_THEN + `read (memory :> bytes32 (word_add (word x) (word (4 * val (i:int64))))) s0 = word (2 * (val i+1)) /\ + read (memory :> bytes32 (word_add (word y) (word (4 * val i)))) s0 = word (val i+1)` + MP_TAC THENL [ + + (* Explode the 40-byte constant memory reads into 40 1-bytes! + Do it twice, one for x and one for y. *) + REPEAT_N 2 EXPLODE_BYTELIST_ASSUM_TAC THEN + + (* For each case where i < 10, concretely evaluate the values from the + exploded bytes, proving the equality. 
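+       CONV_TAC EXPAND_CASES_CONV below turns the quantified goal
+       `!i'. i' < 10 ==> ...` into its ten concrete instances (i' = 0..9),
+       each of which is then discharged by reading the corresponding bytes.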
*) + ABBREV_TAC `i' = val (i:int64)` THEN + UNDISCH_TAC `i' < 10` THEN + SPEC_TAC (`i':num`,`i':num`) THEN + CONV_TAC EXPAND_CASES_CONV THEN + REWRITE_TAC[ARITH;WORD_ADD_0] THEN + + REPEAT CONJ_TAC THEN (fun (asl,w) -> + INTRO_READ_MEMORY_FROM_BYTES8_TAC (lhs w) (asl,w) + ) THEN ASM_REWRITE_TAC[]; + + ALL_TAC + ] THEN + + STRIP_TAC THEN + + ARM_STEPS_TAC EXEC (1--3) THEN + FIRST_X_ASSUM (fun th -> MP_TAC th THEN IMP_REWRITE_TAC[ADRP_ADD_FOLD] THEN DISCH_TAC) THEN + + ARM_STEPS_TAC EXEC (4--7) THEN + FIRST_X_ASSUM (fun th -> MP_TAC th THEN IMP_REWRITE_TAC[ADRP_ADD_FOLD] THEN DISCH_TAC) THEN + + ARM_STEPS_TAC EXEC (8--11) THEN + + (* Prove the postcondition. *) + ENSURES_FINAL_STATE_TAC THEN + + ASM_REWRITE_TAC[WREG_EXPAND_CLAUSES;READ_ZEROTOP_32] THEN + REWRITE_TAC[WORD_BLAST`word_zx (word_zx (x:(32)word):(64)word):(32)word = x`] THEN + CONV_TAC WORD_RULE);; + + +(* Proving the specification of function g(i) that calls f(i + z). *) + +let G_SPEC = prove(`forall x y z i pc retpc. + adrp_within_bounds (word x) (word (pc + 4)) /\ + adrp_within_bounds (word y) (word (pc + 20)) /\ + adrp_within_bounds (word z) (word (pc + 44)) /\ + val i < 9 + ==> + ensures arm + (\s. aligned_bytes_loaded s (word pc) (a_mc pc x y z) /\ + read (memory :> bytelist (word x, LENGTH x_data)) s = x_data /\ + read (memory :> bytelist (word y, LENGTH y_data)) s = y_data /\ + read (memory :> bytelist (word z, LENGTH z_data)) s = z_data /\ + read PC s = word (pc + 0x2c) /\ + read X0 s = i /\ + read X30 s = retpc) + (\s. read W0 s = word (3 * (2 + val i)) /\ + read PC s = retpc) + (MAYCHANGE [X0; X1; X2; X3; X10; X11; PC] ,, MAYCHANGE [events])`, + + REPEAT STRIP_TAC THEN + + ENSURES_INIT_TAC "s0" THEN + + ARM_STEPS_TAC EXEC (1--2) THEN + FIRST_X_ASSUM (fun th -> MP_TAC th THEN IMP_REWRITE_TAC[ADRP_ADD_FOLD] THEN DISCH_TAC) THEN + + (* Prepare load z. *) + EXPLODE_BYTELIST_ASSUM_TAC THEN + INTRO_READ_MEMORY_FROM_BYTES8_TAC + `read (memory :> bytes32 (word z)) s2` THEN + (* Expand read W0 to read X0. *) + RULE_ASSUM_TAC(REWRITE_RULE[WREG_EXPAND_CLAUSES;READ_ZEROTOP_32]) THEN + ARM_STEPS_TAC EXEC (3--4) THEN + + SUBGOAL_THEN `val (word_add (word 1) i:int64) < 10` ASSUME_TAC THENL [ + REWRITE_TAC[VAL_WORD_ADD;VAL_WORD;DIMINDEX_64] THEN ASM_ARITH_TAC; + ALL_TAC + ] THEN + ARM_STEPS_TAC EXEC [5] THEN + + (* Call ARM_SUBROUTINE_SIM_TAC with its arguments. *) + ARM_SUBROUTINE_SIM_TAC + (SPEC_ALL a_mc,EXEC,0,SPEC_ALL a_mc,F_SPEC) + [`x:num`;`y:num`;`z:num`;`read X0 s`; + `pc:num`; `read X30 s`] 6 THEN + + (* Prove the postcondition. *) + ENSURES_FINAL_STATE_TAC THEN + + ASM_REWRITE_TAC[VAL_WORD_ADD;DIMINDEX_64] THEN + AP_TERM_TAC THEN CONV_TAC WORD_REDUCE_CONV THEN + IMP_REWRITE_TAC[MOD_LT] THEN ASM_ARITH_TAC);; diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/sequence.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/sequence.S new file mode 100644 index 00000000000..c3a16766210 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/sequence.S @@ -0,0 +1,4 @@ +add x1, x1, x0 +add x2, x2, x0 +mov x3, #2 +mul x1, x1, x3 diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/sequence.ml b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/sequence.ml new file mode 100644 index 00000000000..51dadf98bb1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/sequence.ml @@ -0,0 +1,101 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(****************************************************************************** + Prove a property of a simple program by splitting into two sequential + chunks with an intermediate assertion. +******************************************************************************) + +(* Please copy this file to the root directory of s2n-bignum, then + follow the instructions. *) + +needs "arm/proofs/base.ml";; + +(* Given a program + 0: 8b000021 add x1, x1, x0 + 4: 8b000042 add x2, x2, x0 + 8: d2800043 mov x3, #0x2 + c: 9b037c21 mul x1, x1, x3 + + Let's prove that x1 in the final state is (x1 + x0) * 2. + As done in "simple.ml", this can be done using symbolic execution. However, + in this file, we will try a slightly different approach: + (1) The program will be splitted into two smaller programs: + + First prog: + 0: 8b000021 add x1, x1, x0 + 4: 8b000042 add x2, x2, x0 + + Second prog: + 8: d2800043 mov x3, #0x2 + c: 9b037c21 mul x1, x1, x3 + + (2) Each program will have its 'ensures' predicate specifying the pre and + postcondition. The postcondition of the first program will be equivalent to + the second one. + (3) By proving the two 'ensures' predicate, the specification of whole + program can be proven. +*) + +let sequence_mc = new_definition `sequence_mc = [ + word 0x21; word 0x00; word 0x00; word 0x8b; // add x2, x1, x0 + word 0x42; word 0x00; word 0x00; word 0x8b; // add x2, x2, x0 + word 0x43; word 0x00; word 0x80; word 0xd2; // mov x3, #0x2 + word 0x21; word 0x7c; word 0x03; word 0x9b // mul x1, x1, x3 + ]:((8)word)list`;; + +let EXEC = ARM_MK_EXEC_RULE sequence_mc;; + +let sequence_SPEC = prove( + `forall pc a b. + ensures arm + // Precondition + (\s. aligned_bytes_loaded s (word pc) sequence_mc /\ + read PC s = word pc /\ + read X0 s = word a /\ + read X1 s = word b /\ + read X2 s = word c) + // Postcondition + (\s. read PC s = word (pc+16) /\ + read X1 s = word ((a + b) * 2)) + // Registers (and memory locations) that may change after execution + (MAYCHANGE [PC;X1;X2;X3])`, + (* Strips the outermost universal quantifier from the conclusion of a goal *) + REPEAT STRIP_TAC THEN + + (* Use ENSURES_SEQUENCE_TAC to split the program into two chunks: + [pc, pc+8) and [pc+8, pc+16). The second argument of the tactic + `\s. read X1 s = word (a + b)` is a lambda function stating the + intermediate state at pc+8. + The result of this tactic will be a conjunction of two ensures, + the left clause of which is a spec of the first chunk and the + right clause is the right one. *) + ENSURES_SEQUENCE_TAC + `pc + 8` + `\s. read X1 s = word (a + b)` THEN + + (* Split the conjunction and create two subgoals. *) + CONJ_TAC THENL [ + (* The first subgoal. *) + (* Now we can use the symbolic execution tactics introduced in "simple.ml". *) + (* Start symbolic execution with state 's0' *) + ENSURES_INIT_TAC "s0" THEN + (* Symbolically run two instructions *) + ARM_STEPS_TAC EXEC (1--2) THEN + (* Try to prove the postcondition and frame as much as possible *) + ENSURES_FINAL_STATE_TAC THEN + (* Use ASM_REWRITE_TAC[] to rewrite the goal using equalities in assumptions. 
*) + ASM_REWRITE_TAC[] THEN + (* Prove: `word_add (word b) (word a) = word (a + b)` *) + CONV_TAC WORD_RULE; + + (* The second subgoal *) + ENSURES_INIT_TAC "s0" THEN + ARM_STEPS_TAC EXEC (1--2) THEN + ENSURES_FINAL_STATE_TAC THEN + ASM_REWRITE_TAC[] THEN + (* Prove: `word (0 + val (word (a + b)) * 2) = word ((a + b) * 2)` *) + CONV_TAC WORD_RULE; + ]);; diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/simple.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/simple.S new file mode 100644 index 00000000000..9439996e1af --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/simple.S @@ -0,0 +1,2 @@ +add x2, x1, x0 +sub x2, x2, x1 diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/simple.ml b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/simple.ml new file mode 100644 index 00000000000..df22765e54b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/simple.ml @@ -0,0 +1,107 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(****************************************************************************** + Proving a simple property about program 'simple.S' +******************************************************************************) + +(* Please copy this file to the root directory of s2n-bignum, then + follow the instructions. *) + +needs "arm/proofs/base.ml";; + +(* Let's prove a simple property of the following program: + + 0: 8b000022 add x2, x1, x0 + 4: cb010042 sub x2, x2, x1 + + Let's start with defining a byte sequence of a program 'simple.S' +*) +let simple_mc = new_definition `simple_mc = [ + word 0x22; word 0x00; word 0x00; word 0x8b; // add x2, x1, x0 + word 0x42; word 0x00; word 0x01; word 0xcb // sub x2, x2, x1 + ]:((8)word)list`;; + +(* Or, you can read .o file and store the byte list as follows: +let simple_mc = define_assert_from_elf "simple_mc" "arm/tutorial/simple.o" +[ + 0x8b000022; (* arm_ADD X2 X1 X0 *) + 0xcb010042 (* arm_SUB X2 X2 X1 *) +];; + +You can get the above OCaml list data structure from +`print_literal_from_elf "<.o file>"` or `save_literal_from_elf "" +"<.o file>"`. +*) + +(* ARM_MK_EXEC_RULE decodes the byte sequence into conjunction of + equalities between the bytes and instructions. *) +let EXEC = ARM_MK_EXEC_RULE simple_mc;; + +(* + In s2n-bignum, a specification (ensures) has three components: + 1. precondition: assume that a program starts from some program state satisfying the critera + 2. postcondition: the program must reach to a program state satisfying the criteria + 3. frame: the start program state and end program state must satisfy this relation + (e.g., this program only changes callee-save register) + In this file, + 1. precondition is: + - the 'simple' binary is loaded at some location in memory, say 'pc' + - the arm program counter register, PC, has value pc + - the arm register X0 has a symbolic value a and X1 has a symbolic value b + 2. postcondition is: + - the arm program counter register, PC, has value pc+8 + (meaning that two instructions have been executed) + - the arm register X2 has value b + 3. frame is: + - the register values of PC and X2 might have been changed + + If you are using the VSCode plugin of HOL Light, you can ctrl+click + (cmd+click for Mac) to jump to definitions. +*) +let SIMPLE_SPEC = prove( + `forall pc a b. + ensures arm + // Precondition + (\s. 
// aligned_bytes_loaded states that a byte sequence 'simple_mc' + // is loaded at memory location 'pc' in the state 's' and also + // 4-bytes aligned. + aligned_bytes_loaded s (word pc) simple_mc /\ + // 'word' is a bit-vector type in HOL Light. + // 'word a' means it is a bit-vector whose numeral (:num type) + // is 'a'. Its bit-width is inferred as 64 bits here, but it can + // be manually annotated as (word a:(64)word). + read PC s = word pc /\ + read X0 s = word a /\ + read X1 s = word b) + // Postcondition + (\s. read PC s = word (pc+8) /\ + read X2 s = word a) + // Registers (and memory locations) that may change after execution + (MAYCHANGE [PC;X2])`, + + (* Strips the outermost universal quantifier from the conclusion of a goal *) + REPEAT STRIP_TAC THEN + (* Start symbolic execution with state 's0' *) + ENSURES_INIT_TAC "s0" THEN + + (* Symbolically run two instructions *) + ARM_STEPS_TAC EXEC (1--2) THEN + (* Try to prove the postcondition and frame as much as possible *) + ENSURES_FINAL_STATE_TAC THEN + + (* Use ASM_REWRITE_TAC[] to rewrite the goal using equalities in assumptions. *) + ASM_REWRITE_TAC[] THEN + (* We need to prove this: + `word_sub (word_add (word b) (word a)) (word b) = word a` + Use an automated prover for words in HOL Light *) + CONV_TAC WORD_RULE);; + +(* Note that symbolic simulator will discard the output of instructions + if its inputs do not have their symbolic expressions defined in assumption. + To list which instructions are discarded by the simulation tactic. + set: + arm_print_log := true;; + This flag will also print helpful informations that are useful. *) diff --git a/third_party/s2n-bignum/s2n-bignum-imported/include/_internal_s2n_bignum.h b/third_party/s2n-bignum/s2n-bignum-imported/include/_internal_s2n_bignum.h new file mode 100644 index 00000000000..98181a5779b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/include/_internal_s2n_bignum.h @@ -0,0 +1,41 @@ + +#ifdef __APPLE__ +# define S2N_BN_SYMBOL(NAME) _##NAME +# if defined(__AARCH64EL__) || defined(__ARMEL__) +# define __LF %% +# else +# define __LF ; +# endif +#else +# define S2N_BN_SYMBOL(name) name +# define __LF ; +#endif + +#define S2N_BN_SYM_VISIBILITY_DIRECTIVE(name) .globl S2N_BN_SYMBOL(name) +#ifdef S2N_BN_HIDE_SYMBOLS +# ifdef __APPLE__ +# define S2N_BN_SYM_PRIVACY_DIRECTIVE(name) .private_extern S2N_BN_SYMBOL(name) +# else +# define S2N_BN_SYM_PRIVACY_DIRECTIVE(name) .hidden S2N_BN_SYMBOL(name) +# endif +#else +# define S2N_BN_SYM_PRIVACY_DIRECTIVE(name) /* NO-OP: S2N_BN_SYM_PRIVACY_DIRECTIVE */ +#endif + +// Enable indirect branch tracking support unless explicitly disabled +// with -DNO_IBT. If the platform supports CET, simply inherit this from +// the usual header. Otherwise manually define _CET_ENDBR, used at each +// x86 entry point, to be the ENDBR64 instruction, with an explicit byte +// sequence for compilers/assemblers that don't know about it. Note that +// it is safe to use ENDBR64 on all platforms, since the encoding is by +// design interpreted as a NOP on all pre-CET x86_64 processors. The only +// downside is a small increase in code size and potentially a modest +// slowdown from executing one more instruction. 
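+// For illustration: each x86_64 entry point in the assembly sources is
+// expected to place _CET_ENDBR immediately after its S2N_BN_SYMBOL(...)
+// label, so that it is a valid target for indirect branches under CET/IBT.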
+ +#if NO_IBT +#define _CET_ENDBR +#elif defined(__CET__) +#include +#else +#define _CET_ENDBR .byte 0xf3,0x0f,0x1e,0xfa +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/include/s2n-bignum-c89.h b/third_party/s2n-bignum/s2n-bignum-imported/include/s2n-bignum-c89.h new file mode 100644 index 00000000000..ca9bec37c72 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/include/s2n-bignum-c89.h @@ -0,0 +1,1114 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + */ + +/* ---------------------------------------------------------------------------- + * C prototypes for s2n-bignum functions, so you can use them in C programs via + * + * #include "s2n-bignum-c89.h" + * + * The functions are listed in alphabetical order with a brief description + * in comments for each one. For more detailed documentation see the comment + * banner at the top of the corresponding assembly (.S) file, and + * for the last word in what properties it satisfies see the spec in the + * formal proof (the .ml file in the architecture-specific directory). + * + * For some functions there are additional variants with names ending in + * "_alt". These have the same core mathematical functionality as their + * non-"alt" versions, but can be better suited to some microarchitectures: + * + * - On x86, the "_alt" forms avoid BMI and ADX instruction set + * extensions, so will run on any x86_64 machine, even older ones + * + * - On ARM, the "_alt" forms target machines with higher multiplier + * throughput, generally offering higher performance there. + * ---------------------------------------------------------------------------- + */ + +/* Add, z := x + y */ +/* Inputs x[m], y[n]; outputs function return (carry-out) and z[p] */ +extern uint64_t bignum_add (uint64_t p, uint64_t *z, uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); + +/* Add modulo p_256, z := (x + y) mod p_256, assuming x and y reduced */ +/* Inputs x[4], y[4]; output z[4] */ +extern void bignum_add_p256 (uint64_t z[4], uint64_t x[4], uint64_t y[4]); + +/* Add modulo p_25519, z := (x + y) mod p_25519, assuming x and y reduced */ +/* Inputs x[4], y[4]; output z[4] */ +extern void bignum_add_p25519 (uint64_t z[4], uint64_t x[4], uint64_t y[4]); + +/* Add modulo p_256k1, z := (x + y) mod p_256k1, assuming x and y reduced */ +/* Inputs x[4], y[4]; output z[4] */ +extern void bignum_add_p256k1 (uint64_t z[4], uint64_t x[4], uint64_t y[4]); + +/* Add modulo p_384, z := (x + y) mod p_384, assuming x and y reduced */ +/* Inputs x[6], y[6]; output z[6] */ +extern void bignum_add_p384 (uint64_t z[6], uint64_t x[6], uint64_t y[6]); + +/* Add modulo p_521, z := (x + y) mod p_521, assuming x and y reduced */ +/* Inputs x[9], y[9]; output z[9] */ +extern void bignum_add_p521 (uint64_t z[9], uint64_t x[9], uint64_t y[9]); + +/* Add modulo p_sm2, z := (x + y) mod p_sm2, assuming x and y reduced */ +/* Inputs x[4], y[4]; output z[4] */ +extern void bignum_add_sm2 (uint64_t z[4], uint64_t x[4], uint64_t y[4]); + +/* Compute "amontification" constant z :== 2^{128k} (congruent mod m) */ +/* Input m[k]; output z[k]; temporary buffer t[>=k] */ +extern void bignum_amontifier (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t); + +/* Almost-Montgomery multiply, z :== (x * y / 2^{64k}) (congruent mod m) */ +/* Inputs x[k], y[k], m[k]; output z[k] */ +extern void bignum_amontmul (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); + +/* Almost-Montgomery reduce, z :== (x' 
/ 2^{64p}) (congruent mod m) */ +/* Inputs x[n], m[k], p; output z[k] */ +extern void bignum_amontredc (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t *m, uint64_t p); + +/* Almost-Montgomery square, z :== (x^2 / 2^{64k}) (congruent mod m) */ +/* Inputs x[k], m[k]; output z[k] */ +extern void bignum_amontsqr (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); + +/* Convert 4-digit (256-bit) bignum to/from big-endian form */ +/* Input x[4]; output z[4] */ +extern void bignum_bigendian_4 (uint64_t z[4], uint64_t x[4]); + +/* Convert 6-digit (384-bit) bignum to/from big-endian form */ +/* Input x[6]; output z[6] */ +extern void bignum_bigendian_6 (uint64_t z[6], uint64_t x[6]); + +/* Select bitfield starting at bit n with length l <= 64 */ +/* Inputs x[k], n, l; output function return */ +extern uint64_t bignum_bitfield (uint64_t k, uint64_t *x, uint64_t n, uint64_t l); + +/* Return size of bignum in bits */ +/* Input x[k]; output function return */ +extern uint64_t bignum_bitsize (uint64_t k, uint64_t *x); + +/* Divide by a single (nonzero) word, z := x / m and return x mod m */ +/* Inputs x[n], m; outputs function return (remainder) and z[k] */ +extern uint64_t bignum_cdiv (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t m); + +/* Divide by a single word, z := x / m when known to be exact */ +/* Inputs x[n], m; output z[k] */ +extern void bignum_cdiv_exact (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t m); + +/* Count leading zero digits (64-bit words) */ +/* Input x[k]; output function return */ +extern uint64_t bignum_cld (uint64_t k, uint64_t *x); + +/* Count leading zero bits */ +/* Input x[k]; output function return */ +extern uint64_t bignum_clz (uint64_t k, uint64_t *x); + +/* Multiply-add with single-word multiplier, z := z + c * y */ +/* Inputs c, y[n]; outputs function return (carry-out) and z[k] */ +extern uint64_t bignum_cmadd (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); + +/* Negated multiply-add with single-word multiplier, z := z - c * y */ +/* Inputs c, y[n]; outputs function return (negative carry-out) and z[k] */ +extern uint64_t bignum_cmnegadd (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); + +/* Find modulus of bignum w.r.t. 
single nonzero word m, returning x mod m */ +/* Input x[k], m; output function return */ +extern uint64_t bignum_cmod (uint64_t k, uint64_t *x, uint64_t m); + +/* Multiply by a single word, z := c * y */ +/* Inputs c, y[n]; outputs function return (carry-out) and z[k] */ +extern uint64_t bignum_cmul (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); + +/* Multiply by a single word modulo p_25519, z := (c * x) mod p_25519, assuming x reduced */ +/* Inputs c, x[4]; output z[4] */ +extern void bignum_cmul_p25519 (uint64_t z[4], uint64_t c, uint64_t x[4]); +extern void bignum_cmul_p25519_alt (uint64_t z[4], uint64_t c, uint64_t x[4]); + +/* Multiply by a single word modulo p_256, z := (c * x) mod p_256, assuming x reduced */ +/* Inputs c, x[4]; output z[4] */ +extern void bignum_cmul_p256 (uint64_t z[4], uint64_t c, uint64_t x[4]); +extern void bignum_cmul_p256_alt (uint64_t z[4], uint64_t c, uint64_t x[4]); + +/* Multiply by a single word modulo p_256k1, z := (c * x) mod p_256k1, assuming x reduced */ +/* Inputs c, x[4]; output z[4] */ +extern void bignum_cmul_p256k1 (uint64_t z[4], uint64_t c, uint64_t x[4]); +extern void bignum_cmul_p256k1_alt (uint64_t z[4], uint64_t c, uint64_t x[4]); + +/* Multiply by a single word modulo p_384, z := (c * x) mod p_384, assuming x reduced */ +/* Inputs c, x[6]; output z[6] */ +extern void bignum_cmul_p384 (uint64_t z[6], uint64_t c, uint64_t x[6]); +extern void bignum_cmul_p384_alt (uint64_t z[6], uint64_t c, uint64_t x[6]); + +/* Multiply by a single word modulo p_521, z := (c * x) mod p_521, assuming x reduced */ +/* Inputs c, x[9]; output z[9] */ +extern void bignum_cmul_p521 (uint64_t z[9], uint64_t c, uint64_t x[9]); +extern void bignum_cmul_p521_alt (uint64_t z[9], uint64_t c, uint64_t x[9]); + +/* Multiply by a single word modulo p_sm2, z := (c * x) mod p_sm2, assuming x reduced */ +/* Inputs c, x[4]; output z[4] */ +extern void bignum_cmul_sm2 (uint64_t z[4], uint64_t c, uint64_t x[4]); +extern void bignum_cmul_sm2_alt (uint64_t z[4], uint64_t c, uint64_t x[4]); + +/* Test bignums for coprimality, gcd(x,y) = 1 */ +/* Inputs x[m], y[n]; output function return; temporary buffer t[>=2*max(m,n)] */ +extern uint64_t bignum_coprime (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y, uint64_t *t); + +/* Copy bignum with zero-extension or truncation, z := x */ +/* Input x[n]; output z[k] */ +extern void bignum_copy (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x); + +/* Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1] */ +/* into z[0..width-1]. */ +/* This function is constant-time with respect to the value of `idx`. This is */ +/* achieved by reading the whole table and using the bit-masking to get the */ +/* `idx`-th row. */ +/* Input table[height*width]; output z[width] */ +extern void bignum_copy_row_from_table (uint64_t *z, uint64_t *table, uint64_t height, + uint64_t width, uint64_t idx); + +/* Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1] */ +/* into z[0..width-1]. width must be a multiple of 8. */ +/* This function is constant-time with respect to the value of `idx`. This is */ +/* achieved by reading the whole table and using the bit-masking to get the */ +/* `idx`-th row. */ +/* Input table[height*width]; output z[width] */ +extern void bignum_copy_row_from_table_8n (uint64_t *z, uint64_t *table, + uint64_t height, uint64_t width, uint64_t idx); + +/* Given table: uint64_t[height*16], copy table[idx*16...(idx+1)*16-1] into z[0..row-1]. 
*/ +/* This function is constant-time with respect to the value of `idx`. This is */ +/* achieved by reading the whole table and using the bit-masking to get the */ +/* `idx`-th row. */ +/* Input table[height*16]; output z[16] */ +extern void bignum_copy_row_from_table_16 (uint64_t *z, uint64_t *table, + uint64_t height, uint64_t idx); + +/* Given table: uint64_t[height*32], copy table[idx*32...(idx+1)*32-1] into z[0..row-1]. */ +/* This function is constant-time with respect to the value of `idx`. This is */ +/* achieved by reading the whole table and using the bit-masking to get the */ +/* `idx`-th row. */ +/* Input table[height*32]; output z[32] */ +extern void bignum_copy_row_from_table_32 (uint64_t *z, uint64_t *table, + uint64_t height, uint64_t idx); + +/* Count trailing zero digits (64-bit words) */ +/* Input x[k]; output function return */ +extern uint64_t bignum_ctd (uint64_t k, uint64_t *x); + +/* Count trailing zero bits */ +/* Input x[k]; output function return */ +extern uint64_t bignum_ctz (uint64_t k, uint64_t *x); + +/* Convert from almost-Montgomery form, z := (x / 2^256) mod p_256 */ +/* Input x[4]; output z[4] */ +extern void bignum_deamont_p256 (uint64_t z[4], uint64_t x[4]); +extern void bignum_deamont_p256_alt (uint64_t z[4], uint64_t x[4]); + +/* Convert from almost-Montgomery form, z := (x / 2^256) mod p_256k1 */ +/* Input x[4]; output z[4] */ +extern void bignum_deamont_p256k1 (uint64_t z[4], uint64_t x[4]); + +/* Convert from almost-Montgomery form, z := (x / 2^384) mod p_384 */ +/* Input x[6]; output z[6] */ +extern void bignum_deamont_p384 (uint64_t z[6], uint64_t x[6]); +extern void bignum_deamont_p384_alt (uint64_t z[6], uint64_t x[6]); + +/* Convert from almost-Montgomery form z := (x / 2^576) mod p_521 */ +/* Input x[9]; output z[9] */ +extern void bignum_deamont_p521 (uint64_t z[9], uint64_t x[9]); + +/* Convert from almost-Montgomery form z := (x / 2^256) mod p_sm2 */ +/* Input x[4]; output z[4] */ +extern void bignum_deamont_sm2 (uint64_t z[4], uint64_t x[4]); + +/* Convert from (almost-)Montgomery form z := (x / 2^{64k}) mod m */ +/* Inputs x[k], m[k]; output z[k] */ +extern void bignum_demont (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); + +/* Convert from Montgomery form z := (x / 2^256) mod p_256, assuming x reduced */ +/* Input x[4]; output z[4] */ +extern void bignum_demont_p256 (uint64_t z[4], uint64_t x[4]); +extern void bignum_demont_p256_alt (uint64_t z[4], uint64_t x[4]); + +/* Convert from Montgomery form z := (x / 2^256) mod p_256k1, assuming x reduced */ +/* Input x[4]; output z[4] */ +extern void bignum_demont_p256k1 (uint64_t z[4], uint64_t x[4]); + +/* Convert from Montgomery form z := (x / 2^384) mod p_384, assuming x reduced */ +/* Input x[6]; output z[6] */ +extern void bignum_demont_p384 (uint64_t z[6], uint64_t x[6]); +extern void bignum_demont_p384_alt (uint64_t z[6], uint64_t x[6]); + +/* Convert from Montgomery form z := (x / 2^576) mod p_521, assuming x reduced */ +/* Input x[9]; output z[9] */ +extern void bignum_demont_p521 (uint64_t z[9], uint64_t x[9]); + +/* Convert from Montgomery form z := (x / 2^256) mod p_sm2, assuming x reduced */ +/* Input x[4]; output z[4] */ +extern void bignum_demont_sm2 (uint64_t z[4], uint64_t x[4]); + +/* Select digit x[n] */ +/* Inputs x[k], n; output function return */ +extern uint64_t bignum_digit (uint64_t k, uint64_t *x, uint64_t n); + +/* Return size of bignum in digits (64-bit word) */ +/* Input x[k]; output function return */ +extern uint64_t bignum_digitsize (uint64_t k, uint64_t 
*x); + +/* Divide bignum by 10: z' := z div 10, returning remainder z mod 10 */ +/* Inputs z[k]; outputs function return (remainder) and z[k] */ +extern uint64_t bignum_divmod10 (uint64_t k, uint64_t *z); + +/* Double modulo p_25519, z := (2 * x) mod p_25519, assuming x reduced */ +/* Input x[4]; output z[4] */ +extern void bignum_double_p25519 (uint64_t z[4], uint64_t x[4]); + +/* Double modulo p_256, z := (2 * x) mod p_256, assuming x reduced */ +/* Input x[4]; output z[4] */ +extern void bignum_double_p256 (uint64_t z[4], uint64_t x[4]); + +/* Double modulo p_256k1, z := (2 * x) mod p_256k1, assuming x reduced */ +/* Input x[4]; output z[4] */ +extern void bignum_double_p256k1 (uint64_t z[4], uint64_t x[4]); + +/* Double modulo p_384, z := (2 * x) mod p_384, assuming x reduced */ +/* Input x[6]; output z[6] */ +extern void bignum_double_p384 (uint64_t z[6], uint64_t x[6]); + +/* Double modulo p_521, z := (2 * x) mod p_521, assuming x reduced */ +/* Input x[9]; output z[9] */ +extern void bignum_double_p521 (uint64_t z[9], uint64_t x[9]); + +/* Double modulo p_sm2, z := (2 * x) mod p_sm2, assuming x reduced */ +/* Input x[4]; output z[4] */ +extern void bignum_double_sm2 (uint64_t z[4], uint64_t x[4]); + +/* Extended Montgomery reduce, returning results in input-output buffer */ +/* Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] */ +extern uint64_t bignum_emontredc (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w); + +/* Extended Montgomery reduce in 8-digit blocks, results in input-output buffer */ +/* Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] */ +extern uint64_t bignum_emontredc_8n (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w); +// Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] +// Temporary buffer m_precalc[12*(k/4-1)] +extern uint64_t bignum_emontredc_8n_cdiff (uint64_t k, uint64_t *z, uint64_t *m, + uint64_t w, uint64_t *m_precalc); +/* Test bignums for equality, x = y */ +/* Inputs x[m], y[n]; output function return */ +extern uint64_t bignum_eq (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); + +/* Test bignum for even-ness */ +/* Input x[k]; output function return */ +extern uint64_t bignum_even (uint64_t k, uint64_t *x); + +/* Convert 4-digit (256-bit) bignum from big-endian bytes */ +/* Input x[32] (bytes); output z[4] */ +extern void bignum_frombebytes_4 (uint64_t z[4], uint8_t x[32]); + +/* Convert 6-digit (384-bit) bignum from big-endian bytes */ +/* Input x[48] (bytes); output z[6] */ +extern void bignum_frombebytes_6 (uint64_t z[6], uint8_t x[48]); + +/* Convert 4-digit (256-bit) bignum from little-endian bytes */ +/* Input x[32] (bytes); output z[4] */ +extern void bignum_fromlebytes_4 (uint64_t z[4], uint8_t x[32]); + +/* Convert 6-digit (384-bit) bignum from little-endian bytes */ +/* Input x[48] (bytes); output z[6] */ +extern void bignum_fromlebytes_6 (uint64_t z[6], uint8_t x[48]); + +/* Convert little-endian bytes to 9-digit 528-bit bignum */ +/* Input x[66] (bytes); output z[9] */ +extern void bignum_fromlebytes_p521 (uint64_t z[9],uint8_t x[66]); + +/* Compare bignums, x >= y */ +/* Inputs x[m], y[n]; output function return */ +extern uint64_t bignum_ge (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); + +/* Compare bignums, x > y */ +/* Inputs x[m], y[n]; output function return */ +extern uint64_t bignum_gt (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); + +/* Halve modulo p_256, z := (x / 2) mod p_256, assuming x reduced */ +/* Input x[4]; output z[4] 
*/ +extern void bignum_half_p256 (uint64_t z[4], uint64_t x[4]); + +/* Halve modulo p_256k1, z := (x / 2) mod p_256k1, assuming x reduced */ +/* Input x[4]; output z[4] */ +extern void bignum_half_p256k1 (uint64_t z[4], uint64_t x[4]); + +/* Halve modulo p_384, z := (x / 2) mod p_384, assuming x reduced */ +/* Input x[6]; output z[6] */ +extern void bignum_half_p384 (uint64_t z[6], uint64_t x[6]); + +/* Halve modulo p_521, z := (x / 2) mod p_521, assuming x reduced */ +/* Input x[9]; output z[9] */ +extern void bignum_half_p521 (uint64_t z[9], uint64_t x[9]); + +/* Halve modulo p_sm2, z := (x / 2) mod p_sm2, assuming x reduced */ +/* Input x[4]; output z[4] */ +extern void bignum_half_sm2 (uint64_t z[4], uint64_t x[4]); + +/* Modular inverse modulo p_25519 = 2^255 - 19 */ +/* Input x[4]; output z[4] */ +extern void bignum_inv_p25519(uint64_t z[4],uint64_t x[4]); + +/* Modular inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1 */ +/* Input x[4]; output z[4] */ +extern void bignum_inv_p256(uint64_t z[4],uint64_t x[4]); + +/* Modular inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1 */ +/* Input x[6]; output z[6] */ +extern void bignum_inv_p384(uint64_t z[6],uint64_t x[6]); + +/* Modular inverse modulo p_521 = 2^521 - 1 */ +/* Input x[9]; output z[9] */ +extern void bignum_inv_p521(uint64_t z[9],uint64_t x[9]); + +/* Modular inverse modulo p_sm2 = 2^256 - 2^224 - 2^96 + 2^64 - 1 */ +/* Input x[4]; output z[4] */ +extern void bignum_inv_sm2(uint64_t z[S2N_BIGNUM_STATIC 4],uint64_t x[S2N_BIGNUM_STATIC 4]); + +/* Inverse square root modulo p_25519 */ +/* Input x[4]; output function return (Legendre symbol) and z[4] */ +extern int64_t bignum_invsqrt_p25519(uint64_t z[4],uint64_t x[4]); +extern int64_t bignum_invsqrt_p25519_alt(uint64_t z[4],uint64_t x[4]); + +/* Test bignum for zero-ness, x = 0 */ +/* Input x[k]; output function return */ +extern uint64_t bignum_iszero (uint64_t k, uint64_t *x); + +/* Multiply z := x * y */ +/* Inputs x[16], y[16]; output z[32]; temporary buffer t[>=32] */ +extern void bignum_kmul_16_32 (uint64_t z[32], uint64_t x[16], uint64_t y[16], uint64_t t[32]); + +/* Multiply z := x * y */ +/* Inputs x[32], y[32]; output z[64]; temporary buffer t[>=96] */ +extern void bignum_kmul_32_64 (uint64_t z[64], uint64_t x[32], uint64_t y[32], uint64_t t[96]); + +/* Square, z := x^2 */ +/* Input x[16]; output z[32]; temporary buffer t[>=24] */ +extern void bignum_ksqr_16_32 (uint64_t z[32], uint64_t x[16], uint64_t t[24]); + +/* Square, z := x^2 */ +/* Input x[32]; output z[64]; temporary buffer t[>=72] */ +extern void bignum_ksqr_32_64 (uint64_t z[64], uint64_t x[32], uint64_t t[72]); + +/* Compare bignums, x <= y */ +/* Inputs x[m], y[n]; output function return */ +extern uint64_t bignum_le (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); + +/* Convert 4-digit (256-bit) bignum to/from little-endian form */ +/* Input x[4]; output z[4] */ +extern void bignum_littleendian_4 (uint64_t z[4], uint64_t x[4]); + +/* Convert 6-digit (384-bit) bignum to/from little-endian form */ +/* Input x[6]; output z[6] */ +extern void bignum_littleendian_6 (uint64_t z[6], uint64_t x[6]); + +/* Compare bignums, x < y */ +/* Inputs x[m], y[n]; output function return */ +extern uint64_t bignum_lt (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); + +/* Multiply-add, z := z + x * y */ +/* Inputs x[m], y[n]; outputs function return (carry-out) and z[k] */ +extern uint64_t bignum_madd (uint64_t k, uint64_t *z, uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); + +/* Multiply-add modulo the 
order of the curve25519/edwards25519 basepoint */ +/* Inputs x[4], y[4], c[4]; output z[4] */ +extern void bignum_madd_n25519 (uint64_t z[4], uint64_t x[4], uint64_t y[4], uint64_t c[4]); +extern void bignum_madd_n25519_alt (uint64_t z[4], uint64_t x[4], uint64_t y[4], uint64_t c[4]); + +/* Reduce modulo group order, z := x mod m_25519 */ +/* Input x[4]; output z[4] */ +extern void bignum_mod_m25519_4 (uint64_t z[4], uint64_t x[4]); + +/* Reduce modulo basepoint order, z := x mod n_25519 */ +/* Input x[k]; output z[4] */ +extern void bignum_mod_n25519 (uint64_t z[4], uint64_t k, uint64_t *x); + +/* Reduce modulo basepoint order, z := x mod n_25519 */ +/* Input x[4]; output z[4] */ +extern void bignum_mod_n25519_4 (uint64_t z[4], uint64_t x[4]); + +/* Reduce modulo group order, z := x mod n_256 */ +/* Input x[k]; output z[4] */ +extern void bignum_mod_n256 (uint64_t z[4], uint64_t k, uint64_t *x); +extern void bignum_mod_n256_alt (uint64_t z[4], uint64_t k, uint64_t *x); + +/* Reduce modulo group order, z := x mod n_256 */ +/* Input x[4]; output z[4] */ +extern void bignum_mod_n256_4 (uint64_t z[4], uint64_t x[4]); + +/* Reduce modulo group order, z := x mod n_256k1 */ +/* Input x[4]; output z[4] */ +extern void bignum_mod_n256k1_4 (uint64_t z[4], uint64_t x[4]); + +/* Reduce modulo group order, z := x mod n_384 */ +/* Input x[k]; output z[6] */ +extern void bignum_mod_n384 (uint64_t z[6], uint64_t k, uint64_t *x); +extern void bignum_mod_n384_alt (uint64_t z[6], uint64_t k, uint64_t *x); + +/* Reduce modulo group order, z := x mod n_384 */ +/* Input x[6]; output z[6] */ +extern void bignum_mod_n384_6 (uint64_t z[6], uint64_t x[6]); + +/* Reduce modulo group order, z := x mod n_521 */ +/* Input x[9]; output z[9] */ +extern void bignum_mod_n521_9 (uint64_t z[9], uint64_t x[9]); +extern void bignum_mod_n521_9_alt (uint64_t z[9], uint64_t x[9]); + +/* Reduce modulo group order, z := x mod n_sm2 */ +/* Input x[k]; output z[4] */ +extern void bignum_mod_nsm2 (uint64_t z[4], uint64_t k, uint64_t *x); +extern void bignum_mod_nsm2_alt (uint64_t z[4], uint64_t k, uint64_t *x); + +/* Reduce modulo group order, z := x mod n_sm2 */ +/* Input x[4]; output z[4] */ +extern void bignum_mod_nsm2_4 (uint64_t z[4], uint64_t x[4]); + +/* Reduce modulo field characteristic, z := x mod p_25519 */ +/* Input x[4]; output z[4] */ +extern void bignum_mod_p25519_4 (uint64_t z[4], uint64_t x[4]); + +/* Reduce modulo field characteristic, z := x mod p_256 */ +/* Input x[k]; output z[4] */ +extern void bignum_mod_p256 (uint64_t z[4], uint64_t k, uint64_t *x); +extern void bignum_mod_p256_alt (uint64_t z[4], uint64_t k, uint64_t *x); + +/* Reduce modulo field characteristic, z := x mod p_256 */ +/* Input x[4]; output z[4] */ +extern void bignum_mod_p256_4 (uint64_t z[4], uint64_t x[4]); + +/* Reduce modulo field characteristic, z := x mod p_256k1 */ +/* Input x[4]; output z[4] */ +extern void bignum_mod_p256k1_4 (uint64_t z[4], uint64_t x[4]); + +/* Reduce modulo field characteristic, z := x mod p_384 */ +/* Input x[k]; output z[6] */ +extern void bignum_mod_p384 (uint64_t z[6], uint64_t k, uint64_t *x); +extern void bignum_mod_p384_alt (uint64_t z[6], uint64_t k, uint64_t *x); + +/* Reduce modulo field characteristic, z := x mod p_384 */ +/* Input x[6]; output z[6] */ +extern void bignum_mod_p384_6 (uint64_t z[6], uint64_t x[6]); + +/* Reduce modulo field characteristic, z := x mod p_521 */ +/* Input x[9]; output z[9] */ +extern void bignum_mod_p521_9 (uint64_t z[9], uint64_t x[9]); + +/* Reduce modulo field 
characteristic, z := x mod p_sm2 */ +/* Input x[k]; output z[4] */ +extern void bignum_mod_sm2 (uint64_t z[4], uint64_t k, uint64_t *x); + +/* Reduce modulo field characteristic, z := x mod p_sm2 */ +/* Input x[4]; output z[4] */ +extern void bignum_mod_sm2_4 (uint64_t z[4], uint64_t x[4]); + +/* Add modulo m, z := (x + y) mod m, assuming x and y reduced */ +/* Inputs x[k], y[k], m[k]; output z[k] */ +extern void bignum_modadd (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); + +/* Double modulo m, z := (2 * x) mod m, assuming x reduced */ +/* Inputs x[k], m[k]; output z[k] */ +extern void bignum_moddouble (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); + +/* Modular exponentiation for arbitrary odd modulus, z := (a^p) mod m */ +/* Inputs a[k], p[k], m[k]; output z[k], temporary buffer t[>=3*k] */ +extern void bignum_modexp(uint64_t k,uint64_t *z, uint64_t *a,uint64_t *p,uint64_t *m,uint64_t *t); + +/* Compute "modification" constant z := 2^{64k} mod m */ +/* Input m[k]; output z[k]; temporary buffer t[>=k] */ +extern void bignum_modifier (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t); + +/* Invert modulo m, z = (1/a) mod b, assuming b is an odd number > 1, a coprime to b */ +/* Inputs a[k], b[k]; output z[k]; temporary buffer t[>=3*k] */ +extern void bignum_modinv (uint64_t k, uint64_t *z, uint64_t *a, uint64_t *b, uint64_t *t); + +/* Optionally negate modulo m, z := (-x) mod m (if p nonzero) or z := x (if p zero), assuming x reduced */ +/* Inputs p, x[k], m[k]; output z[k] */ +extern void bignum_modoptneg (uint64_t k, uint64_t *z, uint64_t p, uint64_t *x, uint64_t *m); + +/* Subtract modulo m, z := (x - y) mod m, assuming x and y reduced */ +/* Inputs x[k], y[k], m[k]; output z[k] */ +extern void bignum_modsub (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); + +/* Compute "montification" constant z := 2^{128k} mod m */ +/* Input m[k]; output z[k]; temporary buffer t[>=k] */ +extern void bignum_montifier (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t); + +/* Montgomery inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1 */ +/* Input x[4]; output z[4] */ +extern void bignum_montinv_p256(uint64_t z[4],uint64_t x[4]); + +/* Montgomery inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1 */ +/* Input x[6]; output z[6] */ +extern void bignum_montinv_p384(uint64_t z[S2N_BIGNUM_STATIC 6],uint64_t x[S2N_BIGNUM_STATIC 6]); + +/* Montgomery inverse modulo p_sm2 = 2^256 - 2^224 - 2^96 + 2^64 - 1 */ +/* Input x[4]; output z[4] */ +extern void bignum_montinv_sm2(uint64_t z[4],uint64_t x[4]); + +/* Montgomery multiply, z := (x * y / 2^{64k}) mod m */ +/* Inputs x[k], y[k], m[k]; output z[k] */ +extern void bignum_montmul (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); + +/* Montgomery multiply, z := (x * y / 2^256) mod p_256 */ +/* Inputs x[4], y[4]; output z[4] */ +extern void bignum_montmul_p256 (uint64_t z[4], uint64_t x[4], uint64_t y[4]); +extern void bignum_montmul_p256_alt (uint64_t z[4], uint64_t x[4], uint64_t y[4]); + +/* Montgomery multiply, z := (x * y / 2^256) mod p_256k1 */ +/* Inputs x[4], y[4]; output z[4] */ +extern void bignum_montmul_p256k1 (uint64_t z[4], uint64_t x[4], uint64_t y[4]); +extern void bignum_montmul_p256k1_alt (uint64_t z[4], uint64_t x[4], uint64_t y[4]); + +/* Montgomery multiply, z := (x * y / 2^384) mod p_384 */ +/* Inputs x[6], y[6]; output z[6] */ +extern void bignum_montmul_p384 (uint64_t z[6], uint64_t x[6], uint64_t y[6]); +extern void bignum_montmul_p384_alt (uint64_t z[6], uint64_t x[6], 
uint64_t y[6]); + +/* Montgomery multiply, z := (x * y / 2^576) mod p_521 */ +/* Inputs x[9], y[9]; output z[9] */ +extern void bignum_montmul_p521 (uint64_t z[9], uint64_t x[9], uint64_t y[9]); +extern void bignum_montmul_p521_alt (uint64_t z[9], uint64_t x[9], uint64_t y[9]); + +/* Montgomery multiply, z := (x * y / 2^256) mod p_sm2 */ +/* Inputs x[4], y[4]; output z[4] */ +extern void bignum_montmul_sm2 (uint64_t z[4], uint64_t x[4], uint64_t y[4]); +extern void bignum_montmul_sm2_alt (uint64_t z[4], uint64_t x[4], uint64_t y[4]); + +/* Montgomery reduce, z := (x' / 2^{64p}) MOD m */ +/* Inputs x[n], m[k], p; output z[k] */ +extern void bignum_montredc (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t *m, uint64_t p); + +/* Montgomery square, z := (x^2 / 2^{64k}) mod m */ +/* Inputs x[k], m[k]; output z[k] */ +extern void bignum_montsqr (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); + +/* Montgomery square, z := (x^2 / 2^256) mod p_256 */ +/* Input x[4]; output z[4] */ +extern void bignum_montsqr_p256 (uint64_t z[4], uint64_t x[4]); +extern void bignum_montsqr_p256_alt (uint64_t z[4], uint64_t x[4]); + +/* Montgomery square, z := (x^2 / 2^256) mod p_256k1 */ +/* Input x[4]; output z[4] */ +extern void bignum_montsqr_p256k1 (uint64_t z[4], uint64_t x[4]); +extern void bignum_montsqr_p256k1_alt (uint64_t z[4], uint64_t x[4]); + +/* Montgomery square, z := (x^2 / 2^384) mod p_384 */ +/* Input x[6]; output z[6] */ +extern void bignum_montsqr_p384 (uint64_t z[6], uint64_t x[6]); +extern void bignum_montsqr_p384_alt (uint64_t z[6], uint64_t x[6]); + +/* Montgomery square, z := (x^2 / 2^576) mod p_521 */ +/* Input x[9]; output z[9] */ +extern void bignum_montsqr_p521 (uint64_t z[9], uint64_t x[9]); +extern void bignum_montsqr_p521_alt (uint64_t z[9], uint64_t x[9]); + +/* Montgomery square, z := (x^2 / 2^256) mod p_sm2 */ +/* Input x[4]; output z[4] */ +extern void bignum_montsqr_sm2 (uint64_t z[4], uint64_t x[4]); +extern void bignum_montsqr_sm2_alt (uint64_t z[4], uint64_t x[4]); + +/* Multiply z := x * y */ +/* Inputs x[m], y[n]; output z[k] */ +extern void bignum_mul (uint64_t k, uint64_t *z, uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); + +/* Multiply z := x * y */ +/* Inputs x[4], y[4]; output z[8] */ +extern void bignum_mul_4_8 (uint64_t z[8], uint64_t x[4], uint64_t y[4]); +extern void bignum_mul_4_8_alt (uint64_t z[8], uint64_t x[4], uint64_t y[4]); + +/* Multiply z := x * y */ +/* Inputs x[6], y[6]; output z[12] */ +extern void bignum_mul_6_12 (uint64_t z[12], uint64_t x[6], uint64_t y[6]); +extern void bignum_mul_6_12_alt (uint64_t z[12], uint64_t x[6], uint64_t y[6]); + +/* Multiply z := x * y */ +/* Inputs x[8], y[8]; output z[16] */ +extern void bignum_mul_8_16 (uint64_t z[16], uint64_t x[8], uint64_t y[8]); +extern void bignum_mul_8_16_alt (uint64_t z[16], uint64_t x[8], uint64_t y[8]); + +/* Multiply modulo p_25519, z := (x * y) mod p_25519 */ +/* Inputs x[4], y[4]; output z[4] */ +extern void bignum_mul_p25519 (uint64_t z[4], uint64_t x[4], uint64_t y[4]); +extern void bignum_mul_p25519_alt (uint64_t z[4], uint64_t x[4], uint64_t y[4]); + +/* Multiply modulo p_256k1, z := (x * y) mod p_256k1 */ +/* Inputs x[4], y[4]; output z[4] */ +extern void bignum_mul_p256k1 (uint64_t z[4], uint64_t x[4], uint64_t y[4]); +extern void bignum_mul_p256k1_alt (uint64_t z[4], uint64_t x[4], uint64_t y[4]); + +/* Multiply modulo p_521, z := (x * y) mod p_521, assuming x and y reduced */ +/* Inputs x[9], y[9]; output z[9] */ +extern void bignum_mul_p521 (uint64_t z[9], 
uint64_t x[9], uint64_t y[9]); +extern void bignum_mul_p521_alt (uint64_t z[9], uint64_t x[9], uint64_t y[9]); + +/* Multiply bignum by 10 and add word: z := 10 * z + d */ +/* Inputs z[k], d; outputs function return (carry) and z[k] */ +extern uint64_t bignum_muladd10 (uint64_t k, uint64_t *z, uint64_t d); + +/* Multiplex/select z := x (if p nonzero) or z := y (if p zero) */ +/* Inputs p, x[k], y[k]; output z[k] */ +extern void bignum_mux (uint64_t p, uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y); + +/* 256-bit multiplex/select z := x (if p nonzero) or z := y (if p zero) */ +/* Inputs p, x[4], y[4]; output z[4] */ +extern void bignum_mux_4 (uint64_t p, uint64_t z[4],uint64_t x[4], uint64_t y[4]); + +/* 384-bit multiplex/select z := x (if p nonzero) or z := y (if p zero) */ +/* Inputs p, x[6], y[6]; output z[6] */ +extern void bignum_mux_6 (uint64_t p, uint64_t z[6],uint64_t x[6], uint64_t y[6]); + +/* Select element from 16-element table, z := xs[k*i] */ +/* Inputs xs[16*k], i; output z[k] */ +extern void bignum_mux16 (uint64_t k, uint64_t *z, uint64_t *xs, uint64_t i); + +/* Negate modulo p_25519, z := (-x) mod p_25519, assuming x reduced */ +/* Input x[4]; output z[4] */ +extern void bignum_neg_p25519 (uint64_t z[4], uint64_t x[4]); + +/* Negate modulo p_256, z := (-x) mod p_256, assuming x reduced */ +/* Input x[4]; output z[4] */ +extern void bignum_neg_p256 (uint64_t z[4], uint64_t x[4]); + +/* Negate modulo p_256k1, z := (-x) mod p_256k1, assuming x reduced */ +/* Input x[4]; output z[4] */ +extern void bignum_neg_p256k1 (uint64_t z[4], uint64_t x[4]); + +/* Negate modulo p_384, z := (-x) mod p_384, assuming x reduced */ +/* Input x[6]; output z[6] */ +extern void bignum_neg_p384 (uint64_t z[6], uint64_t x[6]); + +/* Negate modulo p_521, z := (-x) mod p_521, assuming x reduced */ +/* Input x[9]; output z[9] */ +extern void bignum_neg_p521 (uint64_t z[9], uint64_t x[9]); + +/* Negate modulo p_sm2, z := (-x) mod p_sm2, assuming x reduced */ +/* Input x[4]; output z[4] */ +extern void bignum_neg_sm2 (uint64_t z[4], uint64_t x[4]); + +/* Negated modular inverse, z := (-1/x) mod 2^{64k} */ +/* Input x[k]; output z[k] */ +extern void bignum_negmodinv (uint64_t k, uint64_t *z, uint64_t *x); + +/* Test bignum for nonzero-ness x =/= 0 */ +/* Input x[k]; output function return */ +extern uint64_t bignum_nonzero (uint64_t k, uint64_t *x); + +/* Test 256-bit bignum for nonzero-ness x =/= 0 */ +/* Input x[4]; output function return */ +extern uint64_t bignum_nonzero_4(uint64_t x[4]); + +/* Test 384-bit bignum for nonzero-ness x =/= 0 */ +/* Input x[6]; output function return */ +extern uint64_t bignum_nonzero_6(uint64_t x[6]); + +/* Normalize bignum in-place by shifting left till top bit is 1 */ +/* Input z[k]; outputs function return (bits shifted left) and z[k] */ +extern uint64_t bignum_normalize (uint64_t k, uint64_t *z); + +/* Test bignum for odd-ness */ +/* Input x[k]; output function return */ +extern uint64_t bignum_odd (uint64_t k, uint64_t *x); + +/* Convert single digit to bignum, z := n */ +/* Input n; output z[k] */ +extern void bignum_of_word (uint64_t k, uint64_t *z, uint64_t n); + +/* Optionally add, z := x + y (if p nonzero) or z := x (if p zero) */ +/* Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k] */ +extern uint64_t bignum_optadd (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y); + +/* Optionally negate, z := -x (if p nonzero) or z := x (if p zero) */ +/* Inputs p, x[k]; outputs function return (nonzero input) and z[k] */ +extern uint64_t 
bignum_optneg (uint64_t k, uint64_t *z, uint64_t p, uint64_t *x); + +/* Optionally negate modulo p_25519, z := (-x) mod p_25519 (if p nonzero) or z := x (if p zero), assuming x reduced */ +/* Inputs p, x[4]; output z[4] */ +extern void bignum_optneg_p25519 (uint64_t z[4], uint64_t p, uint64_t x[4]); + +/* Optionally negate modulo p_256, z := (-x) mod p_256 (if p nonzero) or z := x (if p zero), assuming x reduced */ +/* Inputs p, x[4]; output z[4] */ +extern void bignum_optneg_p256 (uint64_t z[4], uint64_t p, uint64_t x[4]); + +/* Optionally negate modulo p_256k1, z := (-x) mod p_256k1 (if p nonzero) or z := x (if p zero), assuming x reduced */ +/* Inputs p, x[4]; output z[4] */ +extern void bignum_optneg_p256k1 (uint64_t z[4], uint64_t p, uint64_t x[4]); + +/* Optionally negate modulo p_384, z := (-x) mod p_384 (if p nonzero) or z := x (if p zero), assuming x reduced */ +/* Inputs p, x[6]; output z[6] */ +extern void bignum_optneg_p384 (uint64_t z[6], uint64_t p, uint64_t x[6]); + +/* Optionally negate modulo p_521, z := (-x) mod p_521 (if p nonzero) or z := x (if p zero), assuming x reduced */ +/* Inputs p, x[9]; output z[9] */ +extern void bignum_optneg_p521 (uint64_t z[9], uint64_t p, uint64_t x[9]); + +/* Optionally negate modulo p_sm2, z := (-x) mod p_sm2 (if p nonzero) or z := x (if p zero), assuming x reduced */ +/* Inputs p, x[4]; output z[4] */ +extern void bignum_optneg_sm2 (uint64_t z[4], uint64_t p, uint64_t x[4]); + +/* Optionally subtract, z := x - y (if p nonzero) or z := x (if p zero) */ +/* Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k] */ +extern uint64_t bignum_optsub (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y); + +/* Optionally subtract or add, z := x + sgn(p) * y interpreting p as signed */ +/* Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k] */ +extern uint64_t bignum_optsubadd (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y); + +/* Return bignum of power of 2, z := 2^n */ +/* Input n; output z[k] */ +extern void bignum_pow2 (uint64_t k, uint64_t *z, uint64_t n); + +/* Shift bignum left by c < 64 bits z := x * 2^c */ +/* Inputs x[n], c; outputs function return (carry-out) and z[k] */ +extern uint64_t bignum_shl_small (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t c); + +/* Shift bignum right by c < 64 bits z := floor(x / 2^c) */ +/* Inputs x[n], c; outputs function return (bits shifted out) and z[k] */ +extern uint64_t bignum_shr_small (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t c); + +/* Square, z := x^2 */ +/* Input x[n]; output z[k] */ +extern void bignum_sqr (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x); + +/* Square, z := x^2 */ +/* Input x[4]; output z[8] */ +extern void bignum_sqr_4_8 (uint64_t z[8], uint64_t x[4]); +extern void bignum_sqr_4_8_alt (uint64_t z[8], uint64_t x[4]); + +/* Square, z := x^2 */ +/* Input x[6]; output z[12] */ +extern void bignum_sqr_6_12 (uint64_t z[12], uint64_t x[6]); +extern void bignum_sqr_6_12_alt (uint64_t z[12], uint64_t x[6]); + +/* Square, z := x^2 */ +/* Input x[8]; output z[16] */ +extern void bignum_sqr_8_16 (uint64_t z[16], uint64_t x[8]); +extern void bignum_sqr_8_16_alt (uint64_t z[16], uint64_t x[8]); + +/* Square modulo p_25519, z := (x^2) mod p_25519 */ +/* Input x[4]; output z[4] */ +extern void bignum_sqr_p25519 (uint64_t z[4], uint64_t x[4]); +extern void bignum_sqr_p25519_alt (uint64_t z[4], uint64_t x[4]); + +/* Square modulo p_256k1, z := (x^2) mod p_256k1 */ +/* Input x[4]; output z[4] */ +extern void 
bignum_sqr_p256k1 (uint64_t z[4], uint64_t x[4]); +extern void bignum_sqr_p256k1_alt (uint64_t z[4], uint64_t x[4]); + +/* Square modulo p_521, z := (x^2) mod p_521, assuming x reduced */ +/* Input x[9]; output z[9] */ +extern void bignum_sqr_p521 (uint64_t z[9], uint64_t x[9]); +extern void bignum_sqr_p521_alt (uint64_t z[9], uint64_t x[9]); + +/* Square root modulo p_25519 */ +/* Input x[4]; output function return (Legendre symbol) and z[4] */ +extern int64_t bignum_sqrt_p25519(uint64_t z[4],uint64_t x[4]); +extern int64_t bignum_sqrt_p25519_alt(uint64_t z[4],uint64_t x[4]); + +/* Subtract, z := x - y */ +/* Inputs x[m], y[n]; outputs function return (carry-out) and z[p] */ +extern uint64_t bignum_sub (uint64_t p, uint64_t *z, uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); + +/* Subtract modulo p_25519, z := (x - y) mod p_25519, assuming x and y reduced */ +/* Inputs x[4], y[4]; output z[4] */ +extern void bignum_sub_p25519 (uint64_t z[4], uint64_t x[4], uint64_t y[4]); + +/* Subtract modulo p_256, z := (x - y) mod p_256, assuming x and y reduced */ +/* Inputs x[4], y[4]; output z[4] */ +extern void bignum_sub_p256 (uint64_t z[4], uint64_t x[4], uint64_t y[4]); + +/* Subtract modulo p_256k1, z := (x - y) mod p_256k1, assuming x and y reduced */ +/* Inputs x[4], y[4]; output z[4] */ +extern void bignum_sub_p256k1 (uint64_t z[4], uint64_t x[4], uint64_t y[4]); + +/* Subtract modulo p_384, z := (x - y) mod p_384, assuming x and y reduced */ +/* Inputs x[6], y[6]; output z[6] */ +extern void bignum_sub_p384 (uint64_t z[6], uint64_t x[6], uint64_t y[6]); + +/* Subtract modulo p_521, z := (x - y) mod p_521, assuming x and y reduced */ +/* Inputs x[9], y[9]; output z[9] */ +extern void bignum_sub_p521 (uint64_t z[9], uint64_t x[9], uint64_t y[9]); + +/* Subtract modulo p_sm2, z := (x - y) mod p_sm2, assuming x and y reduced */ +/* Inputs x[4], y[4]; output z[4] */ +extern void bignum_sub_sm2 (uint64_t z[4], uint64_t x[4], uint64_t y[4]); + +/* Convert 4-digit (256-bit) bignum to big-endian bytes */ +/* Input x[4]; output z[32] (bytes) */ +extern void bignum_tobebytes_4 (uint8_t z[32], uint64_t x[4]); + +/* Convert 6-digit (384-bit) bignum to big-endian bytes */ +/* Input x[6]; output z[48] (bytes) */ +extern void bignum_tobebytes_6 (uint8_t z[48], uint64_t x[6]); + +/* Convert 4-digit (256-bit) bignum to little-endian bytes */ +/* Input x[4]; output z[32] (bytes) */ +extern void bignum_tolebytes_4 (uint8_t z[32], uint64_t x[4]); + +/* Convert 6-digit (384-bit) bignum to little-endian bytes */ +/* Input x[6]; output z[48] (bytes) */ +extern void bignum_tolebytes_6 (uint8_t z[48], uint64_t x[6]); + +/* Convert 9-digit 528-bit bignum to little-endian bytes */ +/* Input x[9]; output z[66] (bytes) */ +extern void bignum_tolebytes_p521 (uint8_t z[66], uint64_t x[9]); + +/* Convert to Montgomery form z := (2^256 * x) mod p_256 */ +/* Input x[4]; output z[4] */ +extern void bignum_tomont_p256 (uint64_t z[4], uint64_t x[4]); +extern void bignum_tomont_p256_alt (uint64_t z[4], uint64_t x[4]); + +/* Convert to Montgomery form z := (2^256 * x) mod p_256k1 */ +/* Input x[4]; output z[4] */ +extern void bignum_tomont_p256k1 (uint64_t z[4], uint64_t x[4]); +extern void bignum_tomont_p256k1_alt (uint64_t z[4], uint64_t x[4]); + +/* Convert to Montgomery form z := (2^384 * x) mod p_384 */ +/* Input x[6]; output z[6] */ +extern void bignum_tomont_p384 (uint64_t z[6], uint64_t x[6]); +extern void bignum_tomont_p384_alt (uint64_t z[6], uint64_t x[6]); + +/* Convert to Montgomery form z := (2^576 * x) mod p_521
*/ +/* Input x[9]; output z[9] */ +extern void bignum_tomont_p521 (uint64_t z[9], uint64_t x[9]); + +/* Convert to Montgomery form z := (2^256 * x) mod p_sm2 */ +/* Input x[4]; output z[4] */ +extern void bignum_tomont_sm2 (uint64_t z[4], uint64_t x[4]); + +/* Triple modulo p_256, z := (3 * x) mod p_256 */ +/* Input x[4]; output z[4] */ +extern void bignum_triple_p256 (uint64_t z[4], uint64_t x[4]); +extern void bignum_triple_p256_alt (uint64_t z[4], uint64_t x[4]); + +/* Triple modulo p_256k1, z := (3 * x) mod p_256k1 */ +/* Input x[4]; output z[4] */ +extern void bignum_triple_p256k1 (uint64_t z[4], uint64_t x[4]); +extern void bignum_triple_p256k1_alt (uint64_t z[4], uint64_t x[4]); + +/* Triple modulo p_384, z := (3 * x) mod p_384 */ +/* Input x[6]; output z[6] */ +extern void bignum_triple_p384 (uint64_t z[6], uint64_t x[6]); +extern void bignum_triple_p384_alt (uint64_t z[6], uint64_t x[6]); + +/* Triple modulo p_521, z := (3 * x) mod p_521, assuming x reduced */ +/* Input x[9]; output z[9] */ +extern void bignum_triple_p521 (uint64_t z[9], uint64_t x[9]); +extern void bignum_triple_p521_alt (uint64_t z[9], uint64_t x[9]); + +/* Triple modulo p_sm2, z := (3 * x) mod p_sm2 */ +/* Input x[4]; output z[4] */ +extern void bignum_triple_sm2 (uint64_t z[4], uint64_t x[4]); +extern void bignum_triple_sm2_alt (uint64_t z[4], uint64_t x[4]); + +/* Montgomery ladder step for curve25519 */ +/* Inputs point[8], pp[16], b; output rr[16] */ +extern void curve25519_ladderstep(uint64_t rr[16],uint64_t point[8],uint64_t pp[16],uint64_t b); +extern void curve25519_ladderstep_alt(uint64_t rr[16],uint64_t point[8],uint64_t pp[16],uint64_t b); + +/* Projective scalar multiplication, x coordinate only, for curve25519 */ +/* Inputs scalar[4], point[4]; output res[8] */ +extern void curve25519_pxscalarmul(uint64_t res[8],uint64_t scalar[4],uint64_t point[4]); +extern void curve25519_pxscalarmul_alt(uint64_t res[8],uint64_t scalar[4],uint64_t point[4]); + +/* x25519 function for curve25519 */ +/* Inputs scalar[4], point[4]; output res[4] */ +extern void curve25519_x25519(uint64_t res[4],uint64_t scalar[4],uint64_t point[4]); +extern void curve25519_x25519_alt(uint64_t res[4],uint64_t scalar[4],uint64_t point[4]); + +/* x25519 function for curve25519 (byte array arguments) */ +/* Inputs scalar[32] (bytes), point[32] (bytes); output res[32] (bytes) */ +extern void curve25519_x25519_byte(uint8_t res[32],uint8_t scalar[32],uint8_t point[32]); +extern void curve25519_x25519_byte_alt(uint8_t res[32],uint8_t scalar[32],uint8_t point[32]); + +/* x25519 function for curve25519 on base element 9 */ +/* Input scalar[4]; output res[4] */ +extern void curve25519_x25519base(uint64_t res[4],uint64_t scalar[4]); +extern void curve25519_x25519base_alt(uint64_t res[4],uint64_t scalar[4]); + +/* x25519 function for curve25519 on base element 9 (byte array arguments) */ +/* Input scalar[32] (bytes); output res[32] (bytes) */ +extern void curve25519_x25519base_byte(uint8_t res[32],uint8_t scalar[32]); +extern void curve25519_x25519base_byte_alt(uint8_t res[32],uint8_t scalar[32]); + +/* Decode compressed 256-bit form of edwards25519 point */ +/* Input c[32] (bytes); output function return and z[8] */ +extern uint64_t edwards25519_decode(uint64_t z[8],uint8_t c[32]); +extern uint64_t edwards25519_decode_alt(uint64_t z[8],uint8_t c[32]); + +/* Encode edwards25519 point into compressed form as 256-bit number */ +/* Input p[8]; output z[32] (bytes) */ +extern void edwards25519_encode(uint8_t z[32], uint64_t p[8]); + +/* Extended 
projective addition for edwards25519 */ +/* Inputs p1[16], p2[16]; output p3[16] */ +extern void edwards25519_epadd(uint64_t p3[16],uint64_t p1[16],uint64_t p2[16]); +extern void edwards25519_epadd_alt(uint64_t p3[16],uint64_t p1[16],uint64_t p2[16]); + +/* Extended projective doubling for edwards25519 */ +/* Inputs p1[12]; output p3[16] */ +extern void edwards25519_epdouble(uint64_t p3[16],uint64_t p1[12]); +extern void edwards25519_epdouble_alt(uint64_t p3[16],uint64_t p1[12]); + +/* Projective doubling for edwards25519 */ +/* Inputs p1[12]; output p3[12] */ +extern void edwards25519_pdouble(uint64_t p3[12],uint64_t p1[12]); +extern void edwards25519_pdouble_alt(uint64_t p3[12],uint64_t p1[12]); + +/* Extended projective + precomputed mixed addition for edwards25519 */ +/* Inputs p1[16], p2[12]; output p3[16] */ +extern void edwards25519_pepadd(uint64_t p3[16],uint64_t p1[16],uint64_t p2[12]); +extern void edwards25519_pepadd_alt(uint64_t p3[16],uint64_t p1[16],uint64_t p2[12]); + +/* Scalar multiplication by standard basepoint for edwards25519 (Ed25519) */ +/* Input scalar[4]; output res[8] */ +extern void edwards25519_scalarmulbase(uint64_t res[8],uint64_t scalar[4]); +extern void edwards25519_scalarmulbase_alt(uint64_t res[8],uint64_t scalar[4]); + +/* Double scalar multiplication for edwards25519, fresh and base point */ +/* Input scalar[4], point[8], bscalar[4]; output res[8] */ +extern void edwards25519_scalarmuldouble(uint64_t res[8],uint64_t scalar[4], uint64_t point[8],uint64_t bscalar[4]); +extern void edwards25519_scalarmuldouble_alt(uint64_t res[8],uint64_t scalar[4], uint64_t point[8],uint64_t bscalar[4]); + +/* Point addition on NIST curve P-256 in Montgomery-Jacobian coordinates */ +/* Inputs p1[12], p2[12]; output p3[12] */ +extern void p256_montjadd(uint64_t p3[12],uint64_t p1[12],uint64_t p2[12]); +extern void p256_montjadd_alt(uint64_t p3[12],uint64_t p1[12],uint64_t p2[12]); + +/* Point doubling on NIST curve P-256 in Montgomery-Jacobian coordinates */ +/* Inputs p1[12]; output p3[12] */ +extern void p256_montjdouble(uint64_t p3[12],uint64_t p1[12]); +extern void p256_montjdouble_alt(uint64_t p3[12],uint64_t p1[12]); + +/* Point mixed addition on NIST curve P-256 in Montgomery-Jacobian coordinates */ +/* Inputs p1[12], p2[8]; output p3[12] */ +extern void p256_montjmixadd(uint64_t p3[12],uint64_t p1[12],uint64_t p2[8]); +extern void p256_montjmixadd_alt(uint64_t p3[12],uint64_t p1[12],uint64_t p2[8]); + +/* Montgomery-Jacobian form scalar multiplication for P-256 */ +/* Input scalar[4], point[12]; output res[12] */ +extern void p256_montjscalarmul(uint64_t res[12],uint64_t scalar[4],uint64_t point[12]); +extern void p256_montjscalarmul_alt(uint64_t res[12],uint64_t scalar[4],uint64_t point[12]); + +/* Scalar multiplication for NIST curve P-256 */ +/* Input scalar[4], point[8]; output res[8] */ +extern void p256_scalarmul(uint64_t res[8],uint64_t scalar[4],uint64_t point[8]); +extern void p256_scalarmul_alt(uint64_t res[8],uint64_t scalar[4],uint64_t point[8]); + +/* Scalar multiplication for precomputed point on NIST curve P-256 */ +/* Input scalar[4], blocksize, table[]; output res[8] */ +extern void p256_scalarmulbase(uint64_t res[8],uint64_t scalar[4],uint64_t blocksize,uint64_t *table); +extern void p256_scalarmulbase_alt(uint64_t res[8],uint64_t scalar[4],uint64_t blocksize,uint64_t *table); + +/* Point addition on NIST curve P-384 in Montgomery-Jacobian coordinates */ +/* Inputs p1[18], p2[18]; output p3[18] */ +extern void p384_montjadd(uint64_t p3[18],uint64_t 
p1[18],uint64_t p2[18]); +extern void p384_montjadd_alt(uint64_t p3[18],uint64_t p1[18],uint64_t p2[18]); + +/* Point doubling on NIST curve P-384 in Montgomery-Jacobian coordinates */ +/* Inputs p1[18]; output p3[18] */ +extern void p384_montjdouble(uint64_t p3[18],uint64_t p1[18]); +extern void p384_montjdouble_alt(uint64_t p3[18],uint64_t p1[18]); + +/* Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates */ +/* Inputs p1[18], p2[12]; output p3[18] */ +extern void p384_montjmixadd(uint64_t p3[18],uint64_t p1[18],uint64_t p2[12]); +extern void p384_montjmixadd_alt(uint64_t p3[18],uint64_t p1[18],uint64_t p2[12]); + +/* Montgomery-Jacobian form scalar multiplication for P-384 */ +/* Input scalar[6], point[18]; output res[18] */ +extern void p384_montjscalarmul(uint64_t res[18],uint64_t scalar[6],uint64_t point[18]); +extern void p384_montjscalarmul_alt(uint64_t res[18],uint64_t scalar[6],uint64_t point[18]); + +/* Point addition on NIST curve P-521 in Jacobian coordinates */ +/* Inputs p1[27], p2[27]; output p3[27] */ +extern void p521_jadd(uint64_t p3[27],uint64_t p1[27],uint64_t p2[27]); +extern void p521_jadd_alt(uint64_t p3[27],uint64_t p1[27],uint64_t p2[27]); + +/* Point doubling on NIST curve P-521 in Jacobian coordinates */ +/* Input p1[27]; output p3[27] */ +extern void p521_jdouble(uint64_t p3[27],uint64_t p1[27]); +extern void p521_jdouble_alt(uint64_t p3[27],uint64_t p1[27]); + +/* Point mixed addition on NIST curve P-521 in Jacobian coordinates */ +/* Inputs p1[27], p2[18]; output p3[27] */ +extern void p521_jmixadd(uint64_t p3[27],uint64_t p1[27],uint64_t p2[18]); +extern void p521_jmixadd_alt(uint64_t p3[27],uint64_t p1[27],uint64_t p2[18]); + +/* Jacobian form scalar multiplication for P-521 */ +/* Input scalar[9], point[27]; output res[27] */ +extern void p521_jscalarmul(uint64_t res[27],uint64_t scalar[9],uint64_t point[27]); +extern void p521_jscalarmul_alt(uint64_t res[27],uint64_t scalar[9],uint64_t point[27]); + +/* Point addition on SECG curve secp256k1 in Jacobian coordinates */ +/* Inputs p1[12], p2[12]; output p3[12] */ +extern void secp256k1_jadd(uint64_t p3[12],uint64_t p1[12],uint64_t p2[12]); +extern void secp256k1_jadd_alt(uint64_t p3[12],uint64_t p1[12],uint64_t p2[12]); + +/* Point doubling on SECG curve secp256k1 in Jacobian coordinates */ +/* Input p1[12]; output p3[12] */ +extern void secp256k1_jdouble(uint64_t p3[12],uint64_t p1[12]); +extern void secp256k1_jdouble_alt(uint64_t p3[12],uint64_t p1[12]); + +/* Point mixed addition on SECG curve secp256k1 in Jacobian coordinates */ +/* Inputs p1[12], p2[8]; output p3[12] */ +extern void secp256k1_jmixadd(uint64_t p3[12],uint64_t p1[12],uint64_t p2[8]); +extern void secp256k1_jmixadd_alt(uint64_t p3[12],uint64_t p1[12],uint64_t p2[8]); + +/* Point addition on CC curve SM2 in Montgomery-Jacobian coordinates */ +/* Inputs p1[12], p2[12]; output p3[12] */ +extern void sm2_montjadd(uint64_t p3[12],uint64_t p1[12],uint64_t p2[12]); +extern void sm2_montjadd_alt(uint64_t p3[12],uint64_t p1[12],uint64_t p2[12]); + +/* Point doubling on CC curve SM2 in Montgomery-Jacobian coordinates */ +/* Inputs p1[12]; output p3[12] */ +extern void sm2_montjdouble(uint64_t p3[12],uint64_t p1[12]); +extern void sm2_montjdouble_alt(uint64_t p3[12],uint64_t p1[12]); + +/* Point mixed addition on CC curve SM2 in Montgomery-Jacobian coordinates */ +/* Inputs p1[12], p2[8]; output p3[12] */ +extern void sm2_montjmixadd(uint64_t p3[12],uint64_t p1[12],uint64_t p2[8]); +extern void sm2_montjmixadd_alt(uint64_t 
p3[12],uint64_t p1[12],uint64_t p2[8]); + +/* Montgomery-Jacobian form scalar multiplication for CC curve SM2 */ +/* Input scalar[4], point[12]; output res[12] */ +extern void sm2_montjscalarmul(uint64_t res[12],uint64_t scalar[4],uint64_t point[12]); +extern void sm2_montjscalarmul_alt(uint64_t res[12],uint64_t scalar[4],uint64_t point[12]); + +/* Reverse the bytes in a single word */ +/* Input a; output function return */ +extern uint64_t word_bytereverse (uint64_t a); + +/* Count leading zero bits in a single word */ +/* Input a; output function return */ +extern uint64_t word_clz (uint64_t a); + +/* Count trailing zero bits in a single word */ +/* Input a; output function return */ +extern uint64_t word_ctz (uint64_t a); + +/* Perform 59 "divstep" iterations and return signed matrix of updates */ +/* Inputs d, f, g; output m[2][2] and function return */ +extern int64_t word_divstep59(int64_t m[2][2],int64_t d,uint64_t f,uint64_t g); + +/* Return maximum of two unsigned 64-bit words */ +/* Inputs a, b; output function return */ +extern uint64_t word_max (uint64_t a, uint64_t b); + +/* Return minimum of two unsigned 64-bit words */ +/* Inputs a, b; output function return */ +extern uint64_t word_min (uint64_t a, uint64_t b); + +/* Single-word negated modular inverse (-1/a) mod 2^64 */ +/* Input a; output function return */ +extern uint64_t word_negmodinv (uint64_t a); + +/* Count number of set bits in a single 64-bit word (population count) */ +/* Input a; output function return */ +extern uint64_t word_popcount (uint64_t a); + +/* Single-word reciprocal, 2^64 + ret = ceil(2^128/a) - 1 if MSB of "a" is set */ +/* Input a; output function return */ +extern uint64_t word_recip (uint64_t a); diff --git a/third_party/s2n-bignum/s2n-bignum-imported/include/s2n-bignum.h b/third_party/s2n-bignum/s2n-bignum-imported/include/s2n-bignum.h new file mode 100644 index 00000000000..faecfec52a2 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/include/s2n-bignum.h @@ -0,0 +1,1120 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// C prototypes for s2n-bignum functions, so you can use them in C programs via +// +// #include "s2n-bignum.h" +// +// The functions are listed in alphabetical order with a brief description +// in comments for each one. For more detailed documentation see the comment +// banner at the top of the corresponding assembly (.S) file, and +// for the last word in what properties it satisfies see the spec in the +// formal proof (the .ml file in the architecture-specific directory). +// +// For some functions there are additional variants with names ending in +// "_alt". These have the same core mathematical functionality as their +// non-"alt" versions, but can be better suited to some microarchitectures: +// +// - On x86, the "_alt" forms avoid BMI and ADX instruction set +// extensions, so will run on any x86_64 machine, even older ones +// +// - On ARM, the "_alt" forms target machines with higher multiplier +// throughput, generally offering higher performance there. 
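
To make the "_alt" selection concrete, here is a minimal dispatch sketch; it is an editorial illustration, not part of the imported header, and have_bmi_adx() is a hypothetical stand-in for whatever CPU-feature probe the caller already uses (e.g. a CPUID check on x86_64).

#include <stdint.h>
#include "s2n-bignum.h"

/* Hypothetical feature probe; replace with the caller's own detection. */
extern int have_bmi_adx(void);

/* Use the default form when the BMI and ADX extensions are available,
   otherwise fall back to the "_alt" form, which runs on any x86_64. */
void montmul_p256_dispatch(uint64_t z[4], const uint64_t x[4],
                           const uint64_t y[4]) {
  if (have_bmi_adx()) {
    bignum_montmul_p256(z, x, y);
  } else {
    bignum_montmul_p256_alt(z, x, y);
  }
}
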
+// ---------------------------------------------------------------------------- + + +#if defined(_MSC_VER) || !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L || defined(__STDC_NO_VLA__) +#define S2N_BIGNUM_STATIC +#else +#define S2N_BIGNUM_STATIC static +#endif + +// Add, z := x + y +// Inputs x[m], y[n]; outputs function return (carry-out) and z[p] +extern uint64_t bignum_add (uint64_t p, uint64_t *z, uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y); + +// Add modulo p_25519, z := (x + y) mod p_25519, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +extern void bignum_add_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Add modulo p_256, z := (x + y) mod p_256, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +extern void bignum_add_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Add modulo p_256k1, z := (x + y) mod p_256k1, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +extern void bignum_add_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Add modulo p_384, z := (x + y) mod p_384, assuming x and y reduced +// Inputs x[6], y[6]; output z[6] +extern void bignum_add_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]); + +// Add modulo p_521, z := (x + y) mod p_521, assuming x and y reduced +// Inputs x[9], y[9]; output z[9] +extern void bignum_add_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]); + +// Add modulo p_sm2, z := (x + y) mod p_sm2, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +extern void bignum_add_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Compute "amontification" constant z :== 2^{128k} (congruent mod m) +// Input m[k]; output z[k]; temporary buffer t[>=k] +extern void bignum_amontifier (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t *t); + +// Almost-Montgomery multiply, z :== (x * y / 2^{64k}) (congruent mod m) +// Inputs x[k], y[k], m[k]; output z[k] +extern void bignum_amontmul (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y, const uint64_t *m); + +// Almost-Montgomery reduce, z :== (x' / 2^{64p}) (congruent mod m) +// Inputs x[n], m[k], p; output z[k] +extern void bignum_amontredc (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, const uint64_t *m, uint64_t p); + +// Almost-Montgomery square, z :== (x^2 / 2^{64k}) (congruent mod m) +// Inputs x[k], m[k]; output z[k] +extern void bignum_amontsqr (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *m); + +// Convert 4-digit (256-bit) bignum to/from big-endian form +// Input x[4]; output z[4] +extern void bignum_bigendian_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Convert 6-digit (384-bit) bignum to/from big-endian form +// Input x[6]; output z[6] +extern void bignum_bigendian_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Select bitfield starting at bit n with length l <= 64 +// Inputs x[k], n, l; output function return +extern uint64_t bignum_bitfield (uint64_t k, const uint64_t *x, uint64_t n, uint64_t l); + +// Return size of bignum in bits +// Input x[k]; output function return +extern uint64_t 
bignum_bitsize (uint64_t k, const uint64_t *x); + +// Divide by a single (nonzero) word, z := x / m and return x mod m +// Inputs x[n], m; outputs function return (remainder) and z[k] +extern uint64_t bignum_cdiv (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, uint64_t m); + +// Divide by a single word, z := x / m when known to be exact +// Inputs x[n], m; output z[k] +extern void bignum_cdiv_exact (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, uint64_t m); + +// Count leading zero digits (64-bit words) +// Input x[k]; output function return +extern uint64_t bignum_cld (uint64_t k, const uint64_t *x); + +// Count leading zero bits +// Input x[k]; output function return +extern uint64_t bignum_clz (uint64_t k, const uint64_t *x); + +// Multiply-add with single-word multiplier, z := z + c * y +// Inputs c, y[n]; outputs function return (carry-out) and z[k] +extern uint64_t bignum_cmadd (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, const uint64_t *y); + +// Negated multiply-add with single-word multiplier, z := z - c * y +// Inputs c, y[n]; outputs function return (negative carry-out) and z[k] +extern uint64_t bignum_cmnegadd (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, const uint64_t *y); + +// Find modulus of bignum w.r.t. single nonzero word m, returning x mod m +// Input x[k], m; output function return +extern uint64_t bignum_cmod (uint64_t k, const uint64_t *x, uint64_t m); + +// Multiply by a single word, z := c * y +// Inputs c, y[n]; outputs function return (carry-out) and z[k] +extern uint64_t bignum_cmul (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, const uint64_t *y); + +// Multiply by a single word modulo p_25519, z := (c * x) mod p_25519, assuming x reduced +// Inputs c, x[4]; output z[4] +extern void bignum_cmul_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_cmul_p25519_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Multiply by a single word modulo p_256, z := (c * x) mod p_256, assuming x reduced +// Inputs c, x[4]; output z[4] +extern void bignum_cmul_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_cmul_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Multiply by a single word modulo p_256k1, z := (c * x) mod p_256k1, assuming x reduced +// Inputs c, x[4]; output z[4] +extern void bignum_cmul_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_cmul_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Multiply by a single word modulo p_384, z := (c * x) mod p_384, assuming x reduced +// Inputs c, x[6]; output z[6] +extern void bignum_cmul_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 6]); +extern void bignum_cmul_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Multiply by a single word modulo p_521, z := (c * x) mod p_521, assuming x reduced +// Inputs c, x[9]; output z[9] +extern void bignum_cmul_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 9]); +extern void bignum_cmul_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Multiply by a single word modulo p_sm2, z := (c * x) mod p_sm2, assuming x reduced +// Inputs c, x[4]; output 
z[4] +extern void bignum_cmul_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_cmul_sm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Test bignums for coprimality, gcd(x,y) = 1 +// Inputs x[m], y[n]; output function return; temporary buffer t[>=2*max(m,n)] +extern uint64_t bignum_coprime (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y, uint64_t *t); + +// Copy bignum with zero-extension or truncation, z := x +// Input x[n]; output z[k] +extern void bignum_copy (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x); + +// Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1] +// into z[0..width-1]. +// This function is constant-time with respect to the value of `idx`. This is +// achieved by reading the whole table and using the bit-masking to get the +// `idx`-th row. +// Input table[height*width]; output z[width] +extern void bignum_copy_row_from_table (uint64_t *z, const uint64_t *table, uint64_t height, + uint64_t width, uint64_t idx); + +// Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1] +// into z[0..width-1]. width must be a multiple of 8. +// This function is constant-time with respect to the value of `idx`. This is +// achieved by reading the whole table and using the bit-masking to get the +// `idx`-th row. +// Input table[height*width]; output z[width] +extern void bignum_copy_row_from_table_8n (uint64_t *z, const uint64_t *table, + uint64_t height, uint64_t width, uint64_t idx); + +// Given table: uint64_t[height*16], copy table[idx*16...(idx+1)*16-1] into z[0..row-1]. +// This function is constant-time with respect to the value of `idx`. This is +// achieved by reading the whole table and using the bit-masking to get the +// `idx`-th row. +// Input table[height*16]; output z[16] +extern void bignum_copy_row_from_table_16 (uint64_t *z, const uint64_t *table, + uint64_t height, uint64_t idx); + +// Given table: uint64_t[height*32], copy table[idx*32...(idx+1)*32-1] into z[0..row-1]. +// This function is constant-time with respect to the value of `idx`. This is +// achieved by reading the whole table and using the bit-masking to get the +// `idx`-th row. 
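
As a usage illustration for the constant-time table-copy routines documented here, a small sketch follows; it is not part of the header, and the table dimensions and contents are arbitrary examples.

#include <stdint.h>
#include "s2n-bignum.h"

/* Select row `secret_idx` of a 4x4-word table into z. The routine reads
   every row and masks out all but the selected one, so the memory access
   pattern does not depend on the index. */
void copy_row_example(void) {
  enum { HEIGHT = 4, WIDTH = 4 };
  static const uint64_t table[HEIGHT * WIDTH] = {
      1, 0, 0, 0,  /* row 0 */
      2, 0, 0, 0,  /* row 1 */
      3, 0, 0, 0,  /* row 2 */
      4, 0, 0, 0}; /* row 3 */
  uint64_t z[WIDTH];
  uint64_t secret_idx = 2;

  bignum_copy_row_from_table(z, table, HEIGHT, WIDTH, secret_idx);
  /* z now holds row 2, i.e. {3, 0, 0, 0}. */
}
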
+// Input table[height*32]; output z[32] +extern void bignum_copy_row_from_table_32 (uint64_t *z, const uint64_t *table, + uint64_t height, uint64_t idx); + +// Count trailing zero digits (64-bit words) +// Input x[k]; output function return +extern uint64_t bignum_ctd (uint64_t k, const uint64_t *x); + +// Count trailing zero bits +// Input x[k]; output function return +extern uint64_t bignum_ctz (uint64_t k, const uint64_t *x); + +// Convert from almost-Montgomery form, z := (x / 2^256) mod p_256 +// Input x[4]; output z[4] +extern void bignum_deamont_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_deamont_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Convert from almost-Montgomery form, z := (x / 2^256) mod p_256k1 +// Input x[4]; output z[4] +extern void bignum_deamont_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Convert from almost-Montgomery form, z := (x / 2^384) mod p_384 +// Input x[6]; output z[6] +extern void bignum_deamont_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); +extern void bignum_deamont_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Convert from almost-Montgomery form z := (x / 2^576) mod p_521 +// Input x[9]; output z[9] +extern void bignum_deamont_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Convert from almost-Montgomery form z := (x / 2^256) mod p_sm2 +// Input x[4]; output z[4] +extern void bignum_deamont_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Convert from (almost-)Montgomery form z := (x / 2^{64k}) mod m +// Inputs x[k], m[k]; output z[k] +extern void bignum_demont (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *m); + +// Convert from Montgomery form z := (x / 2^256) mod p_256, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_demont_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_demont_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Convert from Montgomery form z := (x / 2^256) mod p_256k1, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_demont_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Convert from Montgomery form z := (x / 2^384) mod p_384, assuming x reduced +// Input x[6]; output z[6] +extern void bignum_demont_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); +extern void bignum_demont_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Convert from Montgomery form z := (x / 2^576) mod p_521, assuming x reduced +// Input x[9]; output z[9] +extern void bignum_demont_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Convert from Montgomery form z := (x / 2^256) mod p_sm2, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_demont_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Select digit x[n] +// Inputs x[k], n; output function return +extern uint64_t bignum_digit (uint64_t k, const uint64_t *x, uint64_t n); + +// Return size of bignum in digits (64-bit word) +// Input x[k]; output function return +extern uint64_t bignum_digitsize (uint64_t k, const uint64_t *x); + +// Divide bignum by 10: z' := z div 10, returning remainder z mod 10 
+// Inputs z[k]; outputs function return (remainder) and z[k] +extern uint64_t bignum_divmod10 (uint64_t k, uint64_t *z); + +// Double modulo p_25519, z := (2 * x) mod p_25519, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_double_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Double modulo p_256, z := (2 * x) mod p_256, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_double_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Double modulo p_256k1, z := (2 * x) mod p_256k1, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_double_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Double modulo p_384, z := (2 * x) mod p_384, assuming x reduced +// Input x[6]; output z[6] +extern void bignum_double_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Double modulo p_521, z := (2 * x) mod p_521, assuming x reduced +// Input x[9]; output z[9] +extern void bignum_double_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Double modulo p_sm2, z := (2 * x) mod p_sm2, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_double_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Extended Montgomery reduce, returning results in input-output buffer +// Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] +extern uint64_t bignum_emontredc (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t w); + +// Extended Montgomery reduce in 8-digit blocks, results in input-output buffer +// Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] +extern uint64_t bignum_emontredc_8n (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t w); +// Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] +// Temporary buffer m_precalc[12*(k/4-1)] +extern uint64_t bignum_emontredc_8n_cdiff (uint64_t k, uint64_t *z, const uint64_t *m, + uint64_t w, uint64_t *m_precalc); + +// Test bignums for equality, x = y +// Inputs x[m], y[n]; output function return +extern uint64_t bignum_eq (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y); + +// Test bignum for even-ness +// Input x[k]; output function return +extern uint64_t bignum_even (uint64_t k, const uint64_t *x); + +// Convert 4-digit (256-bit) bignum from big-endian bytes +// Input x[32] (bytes); output z[4] +extern void bignum_frombebytes_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint8_t x[S2N_BIGNUM_STATIC 32]); + +// Convert 6-digit (384-bit) bignum from big-endian bytes +// Input x[48] (bytes); output z[6] +extern void bignum_frombebytes_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint8_t x[S2N_BIGNUM_STATIC 48]); + +// Convert 4-digit (256-bit) bignum from little-endian bytes +// Input x[32] (bytes); output z[4] +extern void bignum_fromlebytes_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint8_t x[S2N_BIGNUM_STATIC 32]); + +// Convert 6-digit (384-bit) bignum from little-endian bytes +// Input x[48] (bytes); output z[6] +extern void bignum_fromlebytes_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint8_t x[S2N_BIGNUM_STATIC 48]); + +// Convert little-endian bytes to 9-digit 528-bit bignum +// Input x[66] (bytes); output z[9] +extern void bignum_fromlebytes_p521 (uint64_t z[S2N_BIGNUM_STATIC 9],const uint8_t x[S2N_BIGNUM_STATIC 66]); + +// Compare bignums, x >= y +// Inputs x[m], y[n]; output function return +extern uint64_t 
bignum_ge (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y); + +// Compare bignums, x > y +// Inputs x[m], y[n]; output function return +extern uint64_t bignum_gt (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y); + +// Halve modulo p_256, z := (x / 2) mod p_256, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_half_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Halve modulo p_256k1, z := (x / 2) mod p_256k1, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_half_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Halve modulo p_384, z := (x / 2) mod p_384, assuming x reduced +// Input x[6]; output z[6] +extern void bignum_half_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Halve modulo p_521, z := (x / 2) mod p_521, assuming x reduced +// Input x[9]; output z[9] +extern void bignum_half_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Halve modulo p_sm2, z := (x / 2) mod p_sm2, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_half_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Modular inverse modulo p_25519 = 2^255 - 19 +// Input x[4]; output z[4] +extern void bignum_inv_p25519(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Modular inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1 +// Input x[4]; output z[4] +extern void bignum_inv_p256(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Modular inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1 +// Input x[6]; output z[6] +extern void bignum_inv_p384(uint64_t z[S2N_BIGNUM_STATIC 6],const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Modular inverse modulo p_521 = 2^521 - 1 +// Input x[9]; output z[9] +extern void bignum_inv_p521(uint64_t z[S2N_BIGNUM_STATIC 9],const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Modular inverse modulo p_sm2 = 2^256 - 2^224 - 2^96 + 2^64 - 1 +// Input x[4]; output z[4] +extern void bignum_inv_sm2(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Inverse square root modulo p_25519 +// Input x[4]; output function return (Legendre symbol) and z[4] +extern int64_t bignum_invsqrt_p25519(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern int64_t bignum_invsqrt_p25519_alt(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Test bignum for zero-ness, x = 0 +// Input x[k]; output function return +extern uint64_t bignum_iszero (uint64_t k, const uint64_t *x); + +// Multiply z := x * y +// Inputs x[16], y[16]; output z[32]; temporary buffer t[>=32] +extern void bignum_kmul_16_32 (uint64_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 16], const uint64_t y[S2N_BIGNUM_STATIC 16], uint64_t t[S2N_BIGNUM_STATIC 32]); + +// Multiply z := x * y +// Inputs x[32], y[32]; output z[64]; temporary buffer t[>=96] +extern void bignum_kmul_32_64 (uint64_t z[S2N_BIGNUM_STATIC 64], const uint64_t x[S2N_BIGNUM_STATIC 32], const uint64_t y[S2N_BIGNUM_STATIC 32], uint64_t t[S2N_BIGNUM_STATIC 96]); + +// Square, z := x^2 +// Input x[16]; output z[32]; temporary buffer t[>=24] +extern void bignum_ksqr_16_32 (uint64_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 16], uint64_t t[S2N_BIGNUM_STATIC 24]); + +// Square, z := x^2 +// Input x[32]; output z[64]; temporary buffer t[>=72] +extern void bignum_ksqr_32_64 
(uint64_t z[S2N_BIGNUM_STATIC 64], const uint64_t x[S2N_BIGNUM_STATIC 32], uint64_t t[S2N_BIGNUM_STATIC 72]); + +// Compare bignums, x <= y +// Inputs x[m], y[n]; output function return +extern uint64_t bignum_le (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y); + +// Convert 4-digit (256-bit) bignum to/from little-endian form +// Input x[4]; output z[4] +extern void bignum_littleendian_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Convert 6-digit (384-bit) bignum to/from little-endian form +// Input x[6]; output z[6] +extern void bignum_littleendian_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Compare bignums, x < y +// Inputs x[m], y[n]; output function return +extern uint64_t bignum_lt (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y); + +// Multiply-add, z := z + x * y +// Inputs x[m], y[n]; outputs function return (carry-out) and z[k] +extern uint64_t bignum_madd (uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y); + +// Multiply-add modulo the order of the curve25519/edwards25519 basepoint +// Inputs x[4], y[4], c[4]; output z[4] +extern void bignum_madd_n25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4], const uint64_t c[S2N_BIGNUM_STATIC 4]); +extern void bignum_madd_n25519_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4], const uint64_t c[S2N_BIGNUM_STATIC 4]); + +// Reduce modulo group order, z := x mod m_25519 +// Input x[4]; output z[4] +extern void bignum_mod_m25519_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Reduce modulo basepoint order, z := x mod n_25519 +// Input x[k]; output z[4] +extern void bignum_mod_n25519 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x); + +// Reduce modulo basepoint order, z := x mod n_25519 +// Input x[4]; output z[4] +extern void bignum_mod_n25519_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Reduce modulo group order, z := x mod n_256 +// Input x[k]; output z[4] +extern void bignum_mod_n256 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x); +extern void bignum_mod_n256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x); + +// Reduce modulo group order, z := x mod n_256 +// Input x[4]; output z[4] +extern void bignum_mod_n256_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Reduce modulo group order, z := x mod n_256k1 +// Input x[4]; output z[4] +extern void bignum_mod_n256k1_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Reduce modulo group order, z := x mod n_384 +// Input x[k]; output z[6] +extern void bignum_mod_n384 (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t k, const uint64_t *x); +extern void bignum_mod_n384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t k, const uint64_t *x); + +// Reduce modulo group order, z := x mod n_384 +// Input x[6]; output z[6] +extern void bignum_mod_n384_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Reduce modulo group order, z := x mod n_521 +// Input x[9]; output z[9] +extern void bignum_mod_n521_9 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); +extern void bignum_mod_n521_9_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Reduce modulo group order, z := x mod 
n_sm2 +// Input x[k]; output z[4] +extern void bignum_mod_nsm2 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x); +extern void bignum_mod_nsm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x); + +// Reduce modulo group order, z := x mod n_sm2 +// Input x[4]; output z[4] +extern void bignum_mod_nsm2_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Reduce modulo field characteristic, z := x mod p_25519 +// Input x[4]; output z[4] +extern void bignum_mod_p25519_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Reduce modulo field characteristic, z := x mod p_256 +// Input x[k]; output z[4] +extern void bignum_mod_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x); +extern void bignum_mod_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x); + +// Reduce modulo field characteristic, z := x mod p_256 +// Input x[4]; output z[4] +extern void bignum_mod_p256_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Reduce modulo field characteristic, z := x mod p_256k1 +// Input x[4]; output z[4] +extern void bignum_mod_p256k1_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Reduce modulo field characteristic, z := x mod p_384 +// Input x[k]; output z[6] +extern void bignum_mod_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t k, const uint64_t *x); +extern void bignum_mod_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t k, const uint64_t *x); + +// Reduce modulo field characteristic, z := x mod p_384 +// Input x[6]; output z[6] +extern void bignum_mod_p384_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Reduce modulo field characteristic, z := x mod p_521 +// Input x[9]; output z[9] +extern void bignum_mod_p521_9 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Reduce modulo field characteristic, z := x mod p_sm2 +// Input x[k]; output z[4] +extern void bignum_mod_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x); + +// Reduce modulo field characteristic, z := x mod p_sm2 +// Input x[4]; output z[4] +extern void bignum_mod_sm2_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Add modulo m, z := (x + y) mod m, assuming x and y reduced +// Inputs x[k], y[k], m[k]; output z[k] +extern void bignum_modadd (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y, const uint64_t *m); + +// Double modulo m, z := (2 * x) mod m, assuming x reduced +// Inputs x[k], m[k]; output z[k] +extern void bignum_moddouble (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *m); + +// Modular exponentiation for arbitrary odd modulus, z := (a^p) mod m +// Inputs a[k], p[k], m[k]; output z[k], temporary buffer t[>=3*k] +extern void bignum_modexp(uint64_t k,uint64_t *z, const uint64_t *a,const uint64_t *p,const uint64_t *m,uint64_t *t); + +// Compute "modification" constant z := 2^{64k} mod m +// Input m[k]; output z[k]; temporary buffer t[>=k] +extern void bignum_modifier (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t *t); + +// Invert modulo m, z = (1/a) mod b, assuming b is an odd number > 1, a coprime to b +// Inputs a[k], b[k]; output z[k]; temporary buffer t[>=3*k] +extern void bignum_modinv (uint64_t k, uint64_t *z, const uint64_t *a, const uint64_t *b, uint64_t *t); + +// Optionally negate modulo m, z := (-x) mod m (if p nonzero) or z := x (if p zero), assuming x reduced +// Inputs 
p, x[k], m[k]; output z[k] +extern void bignum_modoptneg (uint64_t k, uint64_t *z, uint64_t p, const uint64_t *x, const uint64_t *m); + +// Subtract modulo m, z := (x - y) mod m, assuming x and y reduced +// Inputs x[k], y[k], m[k]; output z[k] +extern void bignum_modsub (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y, const uint64_t *m); + +// Compute "montification" constant z := 2^{128k} mod m +// Input m[k]; output z[k]; temporary buffer t[>=k] +extern void bignum_montifier (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t *t); + +// Montgomery inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1 +// Input x[4]; output z[4] +extern void bignum_montinv_p256(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Montgomery inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1 +// Input x[6]; output z[6] +extern void bignum_montinv_p384(uint64_t z[S2N_BIGNUM_STATIC 6],const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Montgomery inverse modulo p_sm2 = 2^256 - 2^224 - 2^96 + 2^64 - 1 +// Input x[4]; output z[4] +extern void bignum_montinv_sm2(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Montgomery multiply, z := (x * y / 2^{64k}) mod m +// Inputs x[k], y[k], m[k]; output z[k] +extern void bignum_montmul (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y, const uint64_t *m); + +// Montgomery multiply, z := (x * y / 2^256) mod p_256 +// Inputs x[4], y[4]; output z[4] +extern void bignum_montmul_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); +extern void bignum_montmul_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Montgomery multiply, z := (x * y / 2^256) mod p_256k1 +// Inputs x[4], y[4]; output z[4] +extern void bignum_montmul_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); +extern void bignum_montmul_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Montgomery multiply, z := (x * y / 2^384) mod p_384 +// Inputs x[6], y[6]; output z[6] +extern void bignum_montmul_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]); +extern void bignum_montmul_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]); + +// Montgomery multiply, z := (x * y / 2^576) mod p_521 +// Inputs x[9], y[9]; output z[9] +extern void bignum_montmul_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]); +extern void bignum_montmul_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]); + +// Montgomery multiply, z := (x * y / 2^256) mod p_sm2 +// Inputs x[4], y[4]; output z[4] +extern void bignum_montmul_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); +extern void bignum_montmul_sm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Montgomery reduce, z := (x' / 2^{64p}) MOD m +// Inputs x[n], m[k], p; output z[k] +extern void bignum_montredc (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, const uint64_t *m, uint64_t p); + +// Montgomery 
square, z := (x^2 / 2^{64k}) mod m +// Inputs x[k], m[k]; output z[k] +extern void bignum_montsqr (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *m); + +// Montgomery square, z := (x^2 / 2^256) mod p_256 +// Input x[4]; output z[4] +extern void bignum_montsqr_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_montsqr_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Montgomery square, z := (x^2 / 2^256) mod p_256k1 +// Input x[4]; output z[4] +extern void bignum_montsqr_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_montsqr_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Montgomery square, z := (x^2 / 2^384) mod p_384 +// Input x[6]; output z[6] +extern void bignum_montsqr_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); +extern void bignum_montsqr_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Montgomery square, z := (x^2 / 2^576) mod p_521 +// Input x[9]; output z[9] +extern void bignum_montsqr_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); +extern void bignum_montsqr_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Montgomery square, z := (x^2 / 2^256) mod p_sm2 +// Input x[4]; output z[4] +extern void bignum_montsqr_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_montsqr_sm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Multiply z := x * y +// Inputs x[m], y[n]; output z[k] +extern void bignum_mul (uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y); + +// Multiply z := x * y +// Inputs x[4], y[4]; output z[8] +extern void bignum_mul_4_8 (uint64_t z[S2N_BIGNUM_STATIC 8], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); +extern void bignum_mul_4_8_alt (uint64_t z[S2N_BIGNUM_STATIC 8], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Multiply z := x * y +// Inputs x[6], y[6]; output z[12] +extern void bignum_mul_6_12 (uint64_t z[S2N_BIGNUM_STATIC 12], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]); +extern void bignum_mul_6_12_alt (uint64_t z[S2N_BIGNUM_STATIC 12], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]); + +// Multiply z := x * y +// Inputs x[8], y[8]; output z[16] +extern void bignum_mul_8_16 (uint64_t z[S2N_BIGNUM_STATIC 16], const uint64_t x[S2N_BIGNUM_STATIC 8], const uint64_t y[S2N_BIGNUM_STATIC 8]); +extern void bignum_mul_8_16_alt (uint64_t z[S2N_BIGNUM_STATIC 16], const uint64_t x[S2N_BIGNUM_STATIC 8], const uint64_t y[S2N_BIGNUM_STATIC 8]); + +// Multiply modulo p_25519, z := (x * y) mod p_25519 +// Inputs x[4], y[4]; output z[4] +extern void bignum_mul_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); +extern void bignum_mul_p25519_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Multiply modulo p_256k1, z := (x * y) mod p_256k1 +// Inputs x[4], y[4]; output z[4] +extern void bignum_mul_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); +extern void bignum_mul_p256k1_alt (uint64_t 
z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Multiply modulo p_521, z := (x * y) mod p_521, assuming x and y reduced +// Inputs x[9], y[9]; output z[9] +extern void bignum_mul_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]); +extern void bignum_mul_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]); + +// Multiply bignum by 10 and add word: z := 10 * z + d +// Inputs z[k], d; outputs function return (carry) and z[k] +extern uint64_t bignum_muladd10 (uint64_t k, uint64_t *z, uint64_t d); + +// Multiplex/select z := x (if p nonzero) or z := y (if p zero) +// Inputs p, x[k], y[k]; output z[k] +extern void bignum_mux (uint64_t p, uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y); + +// 256-bit multiplex/select z := x (if p nonzero) or z := y (if p zero) +// Inputs p, x[4], y[4]; output z[4] +extern void bignum_mux_4 (uint64_t p, uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// 384-bit multiplex/select z := x (if p nonzero) or z := y (if p zero) +// Inputs p, x[6], y[6]; output z[6] +extern void bignum_mux_6 (uint64_t p, uint64_t z[S2N_BIGNUM_STATIC 6],const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]); + +// Select element from 16-element table, z := xs[k*i] +// Inputs xs[16*k], i; output z[k] +extern void bignum_mux16 (uint64_t k, uint64_t *z, const uint64_t *xs, uint64_t i); + +// Negate modulo p_25519, z := (-x) mod p_25519, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_neg_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Negate modulo p_256, z := (-x) mod p_256, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_neg_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Negate modulo p_256k1, z := (-x) mod p_256k1, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_neg_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Negate modulo p_384, z := (-x) mod p_384, assuming x reduced +// Input x[6]; output z[6] +extern void bignum_neg_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Negate modulo p_521, z := (-x) mod p_521, assuming x reduced +// Input x[9]; output z[9] +extern void bignum_neg_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Negate modulo p_sm2, z := (-x) mod p_sm2, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_neg_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Negated modular inverse, z := (-1/x) mod 2^{64k} +// Input x[k]; output z[k] +extern void bignum_negmodinv (uint64_t k, uint64_t *z, const uint64_t *x); + +// Test bignum for nonzero-ness x =/= 0 +// Input x[k]; output function return +extern uint64_t bignum_nonzero (uint64_t k, const uint64_t *x); + +// Test 256-bit bignum for nonzero-ness x =/= 0 +// Input x[4]; output function return +extern uint64_t bignum_nonzero_4(const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Test 384-bit bignum for nonzero-ness x =/= 0 +// Input x[6]; output function return +extern uint64_t bignum_nonzero_6(const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Normalize bignum in-place by shifting left till top bit is 1 +// Input z[k]; outputs function return (bits 
shifted left) and z[k] +extern uint64_t bignum_normalize (uint64_t k, uint64_t *z); + +// Test bignum for odd-ness +// Input x[k]; output function return +extern uint64_t bignum_odd (uint64_t k, const uint64_t *x); + +// Convert single digit to bignum, z := n +// Input n; output z[k] +extern void bignum_of_word (uint64_t k, uint64_t *z, uint64_t n); + +// Optionally add, z := x + y (if p nonzero) or z := x (if p zero) +// Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k] +extern uint64_t bignum_optadd (uint64_t k, uint64_t *z, const uint64_t *x, uint64_t p, const uint64_t *y); + +// Optionally negate, z := -x (if p nonzero) or z := x (if p zero) +// Inputs p, x[k]; outputs function return (nonzero input) and z[k] +extern uint64_t bignum_optneg (uint64_t k, uint64_t *z, uint64_t p, const uint64_t *x); + +// Optionally negate modulo p_25519, z := (-x) mod p_25519 (if p nonzero) or z := x (if p zero), assuming x reduced +// Inputs p, x[4]; output z[4] +extern void bignum_optneg_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Optionally negate modulo p_256, z := (-x) mod p_256 (if p nonzero) or z := x (if p zero), assuming x reduced +// Inputs p, x[4]; output z[4] +extern void bignum_optneg_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Optionally negate modulo p_256k1, z := (-x) mod p_256k1 (if p nonzero) or z := x (if p zero), assuming x reduced +// Inputs p, x[4]; output z[4] +extern void bignum_optneg_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Optionally negate modulo p_384, z := (-x) mod p_384 (if p nonzero) or z := x (if p zero), assuming x reduced +// Inputs p, x[6]; output z[6] +extern void bignum_optneg_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Optionally negate modulo p_521, z := (-x) mod p_521 (if p nonzero) or z := x (if p zero), assuming x reduced +// Inputs p, x[9]; output z[9] +extern void bignum_optneg_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Optionally negate modulo p_sm2, z := (-x) mod p_sm2 (if p nonzero) or z := x (if p zero), assuming x reduced +// Inputs p, x[4]; output z[4] +extern void bignum_optneg_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Optionally subtract, z := x - y (if p nonzero) or z := x (if p zero) +// Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k] +extern uint64_t bignum_optsub (uint64_t k, uint64_t *z, const uint64_t *x, uint64_t p, const uint64_t *y); + +// Optionally subtract or add, z := x + sgn(p) * y interpreting p as signed +// Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k] +extern uint64_t bignum_optsubadd (uint64_t k, uint64_t *z, const uint64_t *x, uint64_t p, const uint64_t *y); + +// Return bignum of power of 2, z := 2^n +// Input n; output z[k] +extern void bignum_pow2 (uint64_t k, uint64_t *z, uint64_t n); + +// Shift bignum left by c < 64 bits z := x * 2^c +// Inputs x[n], c; outputs function return (carry-out) and z[k] +extern uint64_t bignum_shl_small (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, uint64_t c); + +// Shift bignum right by c < 64 bits z := floor(x / 2^c) +// Inputs x[n], c; outputs function return (bits shifted out) and z[k] +extern uint64_t bignum_shr_small (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, uint64_t c); + +// Square, z := 
x^2 +// Input x[n]; output z[k] +extern void bignum_sqr (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x); + +// Square, z := x^2 +// Input x[4]; output z[8] +extern void bignum_sqr_4_8 (uint64_t z[S2N_BIGNUM_STATIC 8], const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_sqr_4_8_alt (uint64_t z[S2N_BIGNUM_STATIC 8], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Square, z := x^2 +// Input x[6]; output z[12] +extern void bignum_sqr_6_12 (uint64_t z[S2N_BIGNUM_STATIC 12], const uint64_t x[S2N_BIGNUM_STATIC 6]); +extern void bignum_sqr_6_12_alt (uint64_t z[S2N_BIGNUM_STATIC 12], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Square, z := x^2 +// Input x[8]; output z[16] +extern void bignum_sqr_8_16 (uint64_t z[S2N_BIGNUM_STATIC 16], const uint64_t x[S2N_BIGNUM_STATIC 8]); +extern void bignum_sqr_8_16_alt (uint64_t z[S2N_BIGNUM_STATIC 16], const uint64_t x[S2N_BIGNUM_STATIC 8]); + +// Square modulo p_25519, z := (x^2) mod p_25519 +// Input x[4]; output z[4] +extern void bignum_sqr_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_sqr_p25519_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Square modulo p_256k1, z := (x^2) mod p_256k1 +// Input x[4]; output z[4] +extern void bignum_sqr_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_sqr_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Square modulo p_521, z := (x^2) mod p_521, assuming x reduced +// Input x[9]; output z[9] +extern void bignum_sqr_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); +extern void bignum_sqr_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Square root modulo p_25519 +// Input x[4]; output function return (Legendre symbol) and z[4] +extern int64_t bignum_sqrt_p25519(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern int64_t bignum_sqrt_p25519_alt(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Subtract, z := x - y +// Inputs x[m], y[n]; outputs function return (carry-out) and z[p] +extern uint64_t bignum_sub (uint64_t p, uint64_t *z, uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y); + +// Subtract modulo p_25519, z := (x - y) mod p_25519, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +extern void bignum_sub_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Subtract modulo p_256, z := (x - y) mod p_256, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +extern void bignum_sub_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Subtract modulo p_256k1, z := (x - y) mod p_256k1, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +extern void bignum_sub_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Subtract modulo p_384, z := (x - y) mod p_384, assuming x and y reduced +// Inputs x[6], y[6]; output z[6] +extern void bignum_sub_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]); + +// Subtract modulo p_521, z := (x - y) mod p_521, assuming x and y reduced +// Inputs x[9], y[9]; output z[9] +extern void bignum_sub_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t 
x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]); + +// Subtract modulo p_sm2, z := (x - y) mod p_sm2, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +extern void bignum_sub_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Convert 4-digit (256-bit) bignum to big-endian bytes +// Input x[4]; output z[32] (bytes) +extern void bignum_tobebytes_4 (uint8_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Convert 6-digit (384-bit) bignum to big-endian bytes +// Input x[6]; output z[48] (bytes) +extern void bignum_tobebytes_6 (uint8_t z[S2N_BIGNUM_STATIC 48], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Convert 4-digit (256-bit) bignum to little-endian bytes +// Input x[4]; output z[32] (bytes) +extern void bignum_tolebytes_4 (uint8_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Convert 6-digit (384-bit) bignum to little-endian bytes +// Input x[6]; output z[48] (bytes) +extern void bignum_tolebytes_6 (uint8_t z[S2N_BIGNUM_STATIC 48], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Convert 9-digit 528-bit bignum to little-endian bytes +// Input x[9]; output z[66] (bytes) +extern void bignum_tolebytes_p521 (uint8_t z[S2N_BIGNUM_STATIC 66], const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Convert to Montgomery form z := (2^256 * x) mod p_256 +// Input x[4]; output z[4] +extern void bignum_tomont_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_tomont_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Convert to Montgomery form z := (2^256 * x) mod p_256k1 +// Input x[4]; output z[4] +extern void bignum_tomont_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_tomont_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Convert to Montgomery form z := (2^384 * x) mod p_384 +// Input x[6]; output z[6] +extern void bignum_tomont_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); +extern void bignum_tomont_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Convert to Montgomery form z := (2^576 * x) mod p_521 +// Input x[9]; output z[9] +extern void bignum_tomont_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Convert to Montgomery form z := (2^256 * x) mod p_sm2 +// Input x[4]; output z[4] +extern void bignum_tomont_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Triple modulo p_256, z := (3 * x) mod p_256 +// Input x[4]; output z[4] +extern void bignum_triple_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_triple_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Triple modulo p_256k1, z := (3 * x) mod p_256k1 +// Input x[4]; output z[4] +extern void bignum_triple_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_triple_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Triple modulo p_384, z := (3 * x) mod p_384 +// Input x[6]; output z[6] +extern void bignum_triple_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); +extern void bignum_triple_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Triple modulo
p_521, z := (3 * x) mod p_521, assuming x reduced +// Input x[9]; output z[9] +extern void bignum_triple_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); +extern void bignum_triple_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Triple modulo p_sm2, z := (3 * x) mod p_sm2 +// Input x[4]; output z[4] +extern void bignum_triple_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_triple_sm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Montgomery ladder step for curve25519 +// Inputs point[8], pp[16], b; output rr[16] +extern void curve25519_ladderstep(uint64_t rr[16],const uint64_t point[8],const uint64_t pp[16],uint64_t b); +extern void curve25519_ladderstep_alt(uint64_t rr[16],const uint64_t point[8],const uint64_t pp[16],uint64_t b); + +// Projective scalar multiplication, x coordinate only, for curve25519 +// Inputs scalar[4], point[4]; output res[8] +extern void curve25519_pxscalarmul(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 4]); +extern void curve25519_pxscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 4]); + +// x25519 function for curve25519 +// Inputs scalar[4], point[4]; output res[4] +extern void curve25519_x25519(uint64_t res[S2N_BIGNUM_STATIC 4],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 4]); +extern void curve25519_x25519_alt(uint64_t res[S2N_BIGNUM_STATIC 4],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 4]); + +// x25519 function for curve25519 (byte array arguments) +// Inputs scalar[32] (bytes), point[32] (bytes); output res[32] (bytes) +extern void curve25519_x25519_byte(uint8_t res[S2N_BIGNUM_STATIC 32],const uint8_t scalar[S2N_BIGNUM_STATIC 32],const uint8_t point[S2N_BIGNUM_STATIC 32]); +extern void curve25519_x25519_byte_alt(uint8_t res[S2N_BIGNUM_STATIC 32],const uint8_t scalar[S2N_BIGNUM_STATIC 32],const uint8_t point[S2N_BIGNUM_STATIC 32]); + +// x25519 function for curve25519 on base element 9 +// Input scalar[4]; output res[4] +extern void curve25519_x25519base(uint64_t res[S2N_BIGNUM_STATIC 4],const uint64_t scalar[S2N_BIGNUM_STATIC 4]); +extern void curve25519_x25519base_alt(uint64_t res[S2N_BIGNUM_STATIC 4],const uint64_t scalar[S2N_BIGNUM_STATIC 4]); + +// x25519 function for curve25519 on base element 9 (byte array arguments) +// Input scalar[32] (bytes); output res[32] (bytes) +extern void curve25519_x25519base_byte(uint8_t res[S2N_BIGNUM_STATIC 32],const uint8_t scalar[S2N_BIGNUM_STATIC 32]); +extern void curve25519_x25519base_byte_alt(uint8_t res[S2N_BIGNUM_STATIC 32],const uint8_t scalar[S2N_BIGNUM_STATIC 32]); + +// Decode compressed 256-bit form of edwards25519 point +// Input c[32] (bytes); output function return and z[8] +extern uint64_t edwards25519_decode(uint64_t z[S2N_BIGNUM_STATIC 8], const uint8_t c[S2N_BIGNUM_STATIC 32]); +extern uint64_t edwards25519_decode_alt(uint64_t z[S2N_BIGNUM_STATIC 8], const uint8_t c[S2N_BIGNUM_STATIC 32]); + +// Encode edwards25519 point into compressed form as 256-bit number +// Input p[8]; output z[32] (bytes) +extern void edwards25519_encode(uint8_t z[S2N_BIGNUM_STATIC 32], const uint64_t p[S2N_BIGNUM_STATIC 8]); + +// Extended projective addition for edwards25519 +// Inputs p1[16], p2[16]; output p3[16] +extern void 
edwards25519_epadd(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 16],const uint64_t p2[S2N_BIGNUM_STATIC 16]); +extern void edwards25519_epadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 16],const uint64_t p2[S2N_BIGNUM_STATIC 16]); + +// Extended projective doubling for edwards25519 +// Inputs p1[12]; output p3[16] +extern void edwards25519_epdouble(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 12]); +extern void edwards25519_epdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 12]); + +// Projective doubling for edwards25519 +// Inputs p1[12]; output p3[12] +extern void edwards25519_pdouble(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]); +extern void edwards25519_pdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]); + +// Extended projective + precomputed mixed addition for edwards25519 +// Inputs p1[16], p2[12]; output p3[16] +extern void edwards25519_pepadd(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 16],const uint64_t p2[S2N_BIGNUM_STATIC 12]); +extern void edwards25519_pepadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 16],const uint64_t p2[S2N_BIGNUM_STATIC 12]); + +// Scalar multiplication by standard basepoint for edwards25519 (Ed25519) +// Input scalar[4]; output res[8] +extern void edwards25519_scalarmulbase(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4]); +extern void edwards25519_scalarmulbase_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4]); + +// Double scalar multiplication for edwards25519, fresh and base point +// Input scalar[4], point[8], bscalar[4]; output res[8] +extern void edwards25519_scalarmuldouble(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4], const uint64_t point[S2N_BIGNUM_STATIC 8],const uint64_t bscalar[S2N_BIGNUM_STATIC 4]); +extern void edwards25519_scalarmuldouble_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4], const uint64_t point[S2N_BIGNUM_STATIC 8],const uint64_t bscalar[S2N_BIGNUM_STATIC 4]); + +// Point addition on NIST curve P-256 in Montgomery-Jacobian coordinates +// Inputs p1[12], p2[12]; output p3[12] +extern void p256_montjadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]); +extern void p256_montjadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]); + +// Point doubling on NIST curve P-256 in Montgomery-Jacobian coordinates +// Inputs p1[12]; output p3[12] +extern void p256_montjdouble(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]); +extern void p256_montjdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]); + +// Point mixed addition on NIST curve P-256 in Montgomery-Jacobian coordinates +// Inputs p1[12], p2[8]; output p3[12] +extern void p256_montjmixadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]); +extern void p256_montjmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]); + +// Montgomery-Jacobian form scalar multiplication for P-256 +// Input scalar[4], point[12]; output res[12] +extern void p256_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 12],const 
uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 12]); +extern void p256_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 12],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 12]); + +// Scalar multiplication for NIST curve P-256 +// Input scalar[4], point[8]; output res[8] +extern void p256_scalarmul(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 8]); +extern void p256_scalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 8]); + +// Scalar multiplication for precomputed point on NIST curve P-256 +// Input scalar[4], blocksize, table[]; output res[8] +extern void p256_scalarmulbase(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],uint64_t blocksize,const uint64_t *table); +extern void p256_scalarmulbase_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],uint64_t blocksize,const uint64_t *table); + +// Point addition on NIST curve P-384 in Montgomery-Jacobian coordinates +// Inputs p1[18], p2[18]; output p3[18] +extern void p384_montjadd(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18],const uint64_t p2[S2N_BIGNUM_STATIC 18]); +extern void p384_montjadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18],const uint64_t p2[S2N_BIGNUM_STATIC 18]); + +// Point doubling on NIST curve P-384 in Montgomery-Jacobian coordinates +// Inputs p1[18]; output p3[18] +extern void p384_montjdouble(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18]); +extern void p384_montjdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18]); + +// Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates +// Inputs p1[18], p2[12]; output p3[18] +extern void p384_montjmixadd(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18],const uint64_t p2[S2N_BIGNUM_STATIC 12]); +extern void p384_montjmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18],const uint64_t p2[S2N_BIGNUM_STATIC 12]); + +// Montgomery-Jacobian form scalar multiplication for P-384 +// Input scalar[6], point[18]; output res[18] +extern void p384_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 18],const uint64_t scalar[S2N_BIGNUM_STATIC 6],const uint64_t point[S2N_BIGNUM_STATIC 18]); +extern void p384_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 18],const uint64_t scalar[S2N_BIGNUM_STATIC 6],const uint64_t point[S2N_BIGNUM_STATIC 18]); + +// Point addition on NIST curve P-521 in Jacobian coordinates +// Inputs p1[27], p2[27]; output p3[27] +extern void p521_jadd(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27],const uint64_t p2[S2N_BIGNUM_STATIC 27]); +extern void p521_jadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27],const uint64_t p2[S2N_BIGNUM_STATIC 27]); + +// Point doubling on NIST curve P-521 in Jacobian coordinates +// Input p1[27]; output p3[27] +extern void p521_jdouble(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27]); +extern void p521_jdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27]); + +// Point mixed addition on NIST curve P-521 in Jacobian coordinates +// Inputs p1[27], p2[18]; output p3[27] +extern void p521_jmixadd(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 
27],const uint64_t p2[S2N_BIGNUM_STATIC 18]); +extern void p521_jmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27],const uint64_t p2[S2N_BIGNUM_STATIC 18]); + +// Jacobian form scalar multiplication for P-521 +// Input scalar[9], point[27]; output res[27] +extern void p521_jscalarmul(uint64_t res[S2N_BIGNUM_STATIC 27],const uint64_t scalar[S2N_BIGNUM_STATIC 9],const uint64_t point[S2N_BIGNUM_STATIC 27]); +extern void p521_jscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 27],const uint64_t scalar[S2N_BIGNUM_STATIC 9],const uint64_t point[S2N_BIGNUM_STATIC 27]); + +// Point addition on SECG curve secp256k1 in Jacobian coordinates +// Inputs p1[12], p2[12]; output p3[12] +extern void secp256k1_jadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]); +extern void secp256k1_jadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]); + +// Point doubling on SECG curve secp256k1 in Jacobian coordinates +// Input p1[12]; output p3[12] +extern void secp256k1_jdouble(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]); +extern void secp256k1_jdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]); + +// Point mixed addition on SECG curve secp256k1 in Jacobian coordinates +// Inputs p1[12], p2[8]; output p3[12] +extern void secp256k1_jmixadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]); +extern void secp256k1_jmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]); + +// Point addition on CC curve SM2 in Montgomery-Jacobian coordinates +// Inputs p1[12], p2[12]; output p3[12] +extern void sm2_montjadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]); +extern void sm2_montjadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]); + +// Point doubling on CC curve SM2 in Montgomery-Jacobian coordinates +// Inputs p1[12]; output p3[12] +extern void sm2_montjdouble(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]); +extern void sm2_montjdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]); + +// Point mixed addition on CC curve SM2 in Montgomery-Jacobian coordinates +// Inputs p1[12], p2[8]; output p3[12] +extern void sm2_montjmixadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]); +extern void sm2_montjmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]); + +// Montgomery-Jacobian form scalar multiplication for CC curve SM2 +// Input scalar[4], point[12]; output res[12] +extern void sm2_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 12],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 12]); +extern void sm2_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 12],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 12]); + +// Reverse the bytes in a single word +// Input a; output function return +extern uint64_t word_bytereverse (uint64_t a); + +// Count leading zero bits in a single word +// Input a; output function return +extern uint64_t word_clz (uint64_t a); + +// Count 
trailing zero bits in a single word +// Input a; output function return +extern uint64_t word_ctz (uint64_t a); + +// Perform 59 "divstep" iterations and return signed matrix of updates +// Inputs d, f, g; output m[2][2] and function return +extern int64_t word_divstep59(int64_t m[2][2],int64_t d,uint64_t f,uint64_t g); + +// Return maximum of two unsigned 64-bit words +// Inputs a, b; output function return +extern uint64_t word_max (uint64_t a, uint64_t b); + +// Return minimum of two unsigned 64-bit words +// Inputs a, b; output function return +extern uint64_t word_min (uint64_t a, uint64_t b); + +// Single-word negated modular inverse (-1/a) mod 2^64 +// Input a; output function return +extern uint64_t word_negmodinv (uint64_t a); + +// Count number of set bits in a single 64-bit word (population count) +// Input a; output function return +extern uint64_t word_popcount (uint64_t a); + +// Single-word reciprocal, 2^64 + ret = ceil(2^128/a) - 1 if MSB of "a" is set +// Input a; output function return +extern uint64_t word_recip (uint64_t a); diff --git a/third_party/s2n-bignum/s2n-bignum-imported/non_ct_functions.txt b/third_party/s2n-bignum/s2n-bignum-imported/non_ct_functions.txt new file mode 100644 index 00000000000..5b9fe753cd1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/non_ct_functions.txt @@ -0,0 +1,8 @@ +p256/bignum_mod_n256o: +p256/bignum_mod_n256_alto: +p256/bignum_mod_p256o: +p256/bignum_mod_p256_alto: +p384/bignum_mod_n384o: +p384/bignum_mod_n384_alto: +p384/bignum_mod_p384o: +p384/bignum_mod_p384_alto: diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/Makefile b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/Makefile new file mode 100644 index 00000000000..075ec11a61f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/Makefile @@ -0,0 +1,343 @@ +############################################################################# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 +############################################################################# + +# These are the object files corresponding to sources for translation + +OBJ = curve25519/bignum_add_p25519.o \ + curve25519/bignum_cmul_p25519.o \ + curve25519/bignum_cmul_p25519_alt.o \ + curve25519/bignum_double_p25519.o \ + curve25519/bignum_inv_p25519.o \ + curve25519/bignum_invsqrt_p25519.o \ + curve25519/bignum_invsqrt_p25519_alt.o \ + curve25519/bignum_madd_n25519.o \ + curve25519/bignum_madd_n25519_alt.o \ + curve25519/bignum_mod_m25519_4.o \ + curve25519/bignum_mod_n25519.o \ + curve25519/bignum_mod_n25519_4.o \ + curve25519/bignum_mod_p25519_4.o \ + curve25519/bignum_mul_p25519.o \ + curve25519/bignum_mul_p25519_alt.o \ + curve25519/bignum_neg_p25519.o \ + curve25519/bignum_optneg_p25519.o \ + curve25519/bignum_sqr_p25519.o \ + curve25519/bignum_sqr_p25519_alt.o \ + curve25519/bignum_sqrt_p25519.o \ + curve25519/bignum_sqrt_p25519_alt.o \ + curve25519/bignum_sub_p25519.o \ + curve25519/curve25519_ladderstep.o \ + curve25519/curve25519_ladderstep_alt.o \ + curve25519/curve25519_pxscalarmul.o \ + curve25519/curve25519_pxscalarmul_alt.o \ + curve25519/curve25519_x25519.o \ + curve25519/curve25519_x25519_alt.o \ + curve25519/curve25519_x25519base.o \ + curve25519/curve25519_x25519base_alt.o \ + curve25519/edwards25519_decode.o \ + curve25519/edwards25519_decode_alt.o \ + curve25519/edwards25519_encode.o \ + curve25519/edwards25519_epadd.o \ + curve25519/edwards25519_epadd_alt.o \ + curve25519/edwards25519_epdouble.o \ + curve25519/edwards25519_epdouble_alt.o \ + curve25519/edwards25519_pdouble.o \ + curve25519/edwards25519_pdouble_alt.o \ + curve25519/edwards25519_pepadd.o \ + curve25519/edwards25519_pepadd_alt.o \ + curve25519/edwards25519_scalarmulbase.o \ + curve25519/edwards25519_scalarmulbase_alt.o \ + curve25519/edwards25519_scalarmuldouble.o \ + curve25519/edwards25519_scalarmuldouble_alt.o \ + fastmul/bignum_emontredc_8n.o \ + fastmul/bignum_kmul_16_32.o \ + fastmul/bignum_kmul_32_64.o \ + fastmul/bignum_ksqr_16_32.o \ + fastmul/bignum_ksqr_32_64.o \ + fastmul/bignum_mul_4_8.o \ + fastmul/bignum_mul_4_8_alt.o \ + fastmul/bignum_mul_6_12.o \ + fastmul/bignum_mul_6_12_alt.o \ + fastmul/bignum_mul_8_16.o \ + fastmul/bignum_mul_8_16_alt.o \ + fastmul/bignum_sqr_4_8.o \ + fastmul/bignum_sqr_4_8_alt.o \ + fastmul/bignum_sqr_6_12.o \ + fastmul/bignum_sqr_6_12_alt.o \ + fastmul/bignum_sqr_8_16.o \ + fastmul/bignum_sqr_8_16_alt.o \ + generic/bignum_add.o \ + generic/bignum_amontifier.o \ + generic/bignum_amontmul.o \ + generic/bignum_amontredc.o \ + generic/bignum_amontsqr.o \ + generic/bignum_bitfield.o \ + generic/bignum_bitsize.o \ + generic/bignum_cdiv.o \ + generic/bignum_cdiv_exact.o \ + generic/bignum_cld.o \ + generic/bignum_clz.o \ + generic/bignum_cmadd.o \ + generic/bignum_cmnegadd.o \ + generic/bignum_cmod.o \ + generic/bignum_cmul.o \ + generic/bignum_coprime.o \ + generic/bignum_copy.o \ + generic/bignum_ctd.o \ + generic/bignum_ctz.o \ + generic/bignum_demont.o \ + generic/bignum_digit.o \ + generic/bignum_digitsize.o \ + generic/bignum_divmod10.o \ + generic/bignum_emontredc.o \ + generic/bignum_eq.o \ + generic/bignum_even.o \ + generic/bignum_ge.o \ + generic/bignum_gt.o \ + generic/bignum_iszero.o \ + generic/bignum_le.o \ + generic/bignum_lt.o \ + generic/bignum_madd.o \ + generic/bignum_modadd.o \ + generic/bignum_moddouble.o \ + generic/bignum_modexp.o \ + generic/bignum_modifier.o \ + generic/bignum_modinv.o \ + 
generic/bignum_modoptneg.o \ + generic/bignum_modsub.o \ + generic/bignum_montifier.o \ + generic/bignum_montmul.o \ + generic/bignum_montredc.o \ + generic/bignum_montsqr.o \ + generic/bignum_mul.o \ + generic/bignum_muladd10.o \ + generic/bignum_mux.o \ + generic/bignum_mux16.o \ + generic/bignum_negmodinv.o \ + generic/bignum_nonzero.o \ + generic/bignum_normalize.o \ + generic/bignum_odd.o \ + generic/bignum_of_word.o \ + generic/bignum_optadd.o \ + generic/bignum_optneg.o \ + generic/bignum_optsub.o \ + generic/bignum_optsubadd.o \ + generic/bignum_pow2.o \ + generic/bignum_shl_small.o \ + generic/bignum_shr_small.o \ + generic/bignum_sqr.o \ + generic/bignum_sub.o \ + generic/word_bytereverse.o \ + generic/word_clz.o \ + generic/word_ctz.o \ + generic/word_divstep59.o \ + generic/word_max.o \ + generic/word_min.o \ + generic/word_negmodinv.o \ + generic/word_popcount.o \ + generic/word_recip.o \ + p256/bignum_add_p256.o \ + p256/bignum_bigendian_4.o \ + p256/bignum_cmul_p256.o \ + p256/bignum_cmul_p256_alt.o \ + p256/bignum_deamont_p256.o \ + p256/bignum_deamont_p256_alt.o \ + p256/bignum_demont_p256.o \ + p256/bignum_demont_p256_alt.o \ + p256/bignum_double_p256.o \ + p256/bignum_half_p256.o \ + p256/bignum_inv_p256.o \ + p256/bignum_littleendian_4.o \ + p256/bignum_mod_n256.o \ + p256/bignum_mod_n256_alt.o \ + p256/bignum_mod_n256_4.o \ + p256/bignum_mod_p256.o \ + p256/bignum_mod_p256_alt.o \ + p256/bignum_mod_p256_4.o \ + p256/bignum_montinv_p256.o \ + p256/bignum_montmul_p256.o \ + p256/bignum_montmul_p256_alt.o \ + p256/bignum_montsqr_p256.o \ + p256/bignum_montsqr_p256_alt.o \ + p256/bignum_mux_4.o \ + p256/bignum_neg_p256.o \ + p256/bignum_nonzero_4.o \ + p256/bignum_optneg_p256.o \ + p256/bignum_sub_p256.o \ + p256/bignum_tomont_p256.o \ + p256/bignum_tomont_p256_alt.o \ + p256/bignum_triple_p256.o \ + p256/bignum_triple_p256_alt.o \ + p256/p256_montjadd.o \ + p256/p256_montjadd_alt.o \ + p256/p256_montjdouble.o \ + p256/p256_montjdouble_alt.o \ + p256/p256_montjmixadd.o \ + p256/p256_montjmixadd_alt.o \ + p256/p256_montjscalarmul.o \ + p256/p256_montjscalarmul_alt.o \ + p256/p256_scalarmul.o \ + p256/p256_scalarmul_alt.o \ + p256/p256_scalarmulbase.o \ + p256/p256_scalarmulbase_alt.o \ + p384/bignum_add_p384.o \ + p384/bignum_bigendian_6.o \ + p384/bignum_cmul_p384.o \ + p384/bignum_cmul_p384_alt.o \ + p384/bignum_deamont_p384.o \ + p384/bignum_deamont_p384_alt.o \ + p384/bignum_demont_p384.o \ + p384/bignum_demont_p384_alt.o \ + p384/bignum_double_p384.o \ + p384/bignum_half_p384.o \ + p384/bignum_inv_p384.o \ + p384/bignum_littleendian_6.o \ + p384/bignum_mod_n384_alt.o \ + p384/bignum_mod_n384.o \ + p384/bignum_mod_n384_6.o \ + p384/bignum_mod_p384.o \ + p384/bignum_mod_p384_alt.o \ + p384/bignum_mod_p384_6.o \ + p384/bignum_montinv_p384.o \ + p384/bignum_montmul_p384.o \ + p384/bignum_montmul_p384_alt.o \ + p384/bignum_montsqr_p384.o \ + p384/bignum_montsqr_p384_alt.o \ + p384/bignum_mux_6.o \ + p384/bignum_neg_p384.o \ + p384/bignum_nonzero_6.o \ + p384/bignum_optneg_p384.o \ + p384/bignum_sub_p384.o \ + p384/bignum_tomont_p384.o \ + p384/bignum_tomont_p384_alt.o \ + p384/bignum_triple_p384.o \ + p384/bignum_triple_p384_alt.o \ + p384/p384_montjadd.o \ + p384/p384_montjadd_alt.o \ + p384/p384_montjdouble.o \ + p384/p384_montjdouble_alt.o \ + p384/p384_montjmixadd.o \ + p384/p384_montjmixadd_alt.o \ + p384/p384_montjscalarmul.o \ + p384/p384_montjscalarmul_alt.o \ + p521/bignum_add_p521.o \ + p521/bignum_cmul_p521.o \ + p521/bignum_cmul_p521_alt.o \ + 
p521/bignum_deamont_p521.o \ + p521/bignum_demont_p521.o \ + p521/bignum_double_p521.o \ + p521/bignum_fromlebytes_p521.o \ + p521/bignum_half_p521.o \ + p521/bignum_inv_p521.o \ + p521/bignum_mod_n521_9.o \ + p521/bignum_mod_n521_9_alt.o \ + p521/bignum_mod_p521_9.o \ + p521/bignum_montmul_p521.o \ + p521/bignum_montmul_p521_alt.o \ + p521/bignum_montsqr_p521.o \ + p521/bignum_montsqr_p521_alt.o \ + p521/bignum_mul_p521.o \ + p521/bignum_mul_p521_alt.o \ + p521/bignum_neg_p521.o \ + p521/bignum_optneg_p521.o \ + p521/bignum_sqr_p521.o \ + p521/bignum_sqr_p521_alt.o \ + p521/bignum_sub_p521.o \ + p521/bignum_tolebytes_p521.o \ + p521/bignum_tomont_p521.o \ + p521/bignum_triple_p521.o \ + p521/bignum_triple_p521_alt.o \ + p521/p521_jadd.o \ + p521/p521_jadd_alt.o \ + p521/p521_jdouble.o \ + p521/p521_jdouble_alt.o \ + p521/p521_jmixadd.o \ + p521/p521_jmixadd_alt.o \ + p521/p521_jscalarmul.o \ + p521/p521_jscalarmul_alt.o \ + secp256k1/bignum_add_p256k1.o \ + secp256k1/bignum_cmul_p256k1.o \ + secp256k1/bignum_cmul_p256k1_alt.o \ + secp256k1/bignum_deamont_p256k1.o \ + secp256k1/bignum_demont_p256k1.o \ + secp256k1/bignum_double_p256k1.o \ + secp256k1/bignum_half_p256k1.o \ + secp256k1/bignum_mod_n256k1_4.o \ + secp256k1/bignum_mod_p256k1_4.o \ + secp256k1/bignum_montmul_p256k1.o \ + secp256k1/bignum_montmul_p256k1_alt.o \ + secp256k1/bignum_montsqr_p256k1.o \ + secp256k1/bignum_montsqr_p256k1_alt.o \ + secp256k1/bignum_mul_p256k1.o \ + secp256k1/bignum_mul_p256k1_alt.o \ + secp256k1/bignum_neg_p256k1.o \ + secp256k1/bignum_optneg_p256k1.o \ + secp256k1/bignum_sqr_p256k1.o \ + secp256k1/bignum_sqr_p256k1_alt.o \ + secp256k1/bignum_sub_p256k1.o \ + secp256k1/bignum_tomont_p256k1.o \ + secp256k1/bignum_tomont_p256k1_alt.o \ + secp256k1/bignum_triple_p256k1.o \ + secp256k1/bignum_triple_p256k1_alt.o \ + secp256k1/secp256k1_jadd.o \ + secp256k1/secp256k1_jadd_alt.o \ + secp256k1/secp256k1_jdouble.o \ + secp256k1/secp256k1_jdouble_alt.o \ + secp256k1/secp256k1_jmixadd.o \ + secp256k1/secp256k1_jmixadd_alt.o \ + sm2/bignum_add_sm2.o \ + sm2/bignum_cmul_sm2.o \ + sm2/bignum_cmul_sm2_alt.o \ + sm2/bignum_deamont_sm2.o \ + sm2/bignum_demont_sm2.o \ + sm2/bignum_double_sm2.o \ + sm2/bignum_half_sm2.o \ + sm2/bignum_inv_sm2.o \ + sm2/bignum_mod_nsm2.o \ + sm2/bignum_mod_nsm2_alt.o \ + sm2/bignum_mod_nsm2_4.o \ + sm2/bignum_mod_sm2.o \ + sm2/bignum_mod_sm2_4.o \ + sm2/bignum_montinv_sm2.o \ + sm2/bignum_montmul_sm2.o \ + sm2/bignum_montmul_sm2_alt.o \ + sm2/bignum_montsqr_sm2.o \ + sm2/bignum_montsqr_sm2_alt.o \ + sm2/bignum_neg_sm2.o \ + sm2/bignum_optneg_sm2.o \ + sm2/bignum_sub_sm2.o \ + sm2/bignum_tomont_sm2.o \ + sm2/bignum_triple_sm2.o \ + sm2/bignum_triple_sm2_alt.o \ + sm2/sm2_montjadd.o \ + sm2/sm2_montjadd_alt.o \ + sm2/sm2_montjdouble.o \ + sm2/sm2_montjdouble_alt.o \ + sm2/sm2_montjmixadd.o \ + sm2/sm2_montjmixadd_alt.o \ + sm2/sm2_montjscalarmul.o \ + sm2/sm2_montjscalarmul_alt.o + +# The AT&T syntax source files + +ATTSOURCES = $(OBJ:.o=.S) + +code: $(ATTSOURCES) + +all: $(OBJ); + +%.o : %.S ; ($(CC) -E -I../include $< | as -o $@ -); (cd ../x86; $(CC) -E -I../include $< | as -o /tmp/original_object.o); cmp -s $@ /tmp/original_object.o + +curve25519/%.S :: ../x86/curve25519/%.S ; (cat $< | sed -E -f ./attrofy.sed >$@) && ($(CC) -E -I../include -DWINDOWS_ABI=0 $@ | as -o /tmp/translated_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $@ | as -o /tmp/translated_object.obj) ; ($(CC) -E -I../include -DWINDOWS_ABI=0 $< | as -o /tmp/original_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 
$< | as -o /tmp/original_object.obj) ; (cmp -s /tmp/translated_object.o /tmp/original_object.o && cmp -s /tmp/translated_object.obj /tmp/original_object.obj) +fastmul/%.S :: ../x86/fastmul/%.S ; (cat $< | sed -E -f ./attrofy.sed >$@) && ($(CC) -E -I../include -DWINDOWS_ABI=0 $@ | as -o /tmp/translated_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $@ | as -o /tmp/translated_object.obj) ; ($(CC) -E -I../include -DWINDOWS_ABI=0 $< | as -o /tmp/original_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $< | as -o /tmp/original_object.obj) ; (cmp -s /tmp/translated_object.o /tmp/original_object.o && cmp -s /tmp/translated_object.obj /tmp/original_object.obj) +generic/%.S :: ../x86/generic/%.S ; (cat $< | sed -E -f ./attrofy.sed >$@) && ($(CC) -E -I../include -DWINDOWS_ABI=0 $@ | as -o /tmp/translated_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $@ | as -o /tmp/translated_object.obj) ; ($(CC) -E -I../include -DWINDOWS_ABI=0 $< | as -o /tmp/original_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $< | as -o /tmp/original_object.obj) ; (cmp -s /tmp/translated_object.o /tmp/original_object.o && cmp -s /tmp/translated_object.obj /tmp/original_object.obj) +p256/%.S :: ../x86/p256/%.S ; (cat $< | sed -E -f ./attrofy.sed >$@) && ($(CC) -E -I../include -DWINDOWS_ABI=0 $@ | as -o /tmp/translated_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $@ | as -o /tmp/translated_object.obj) ; ($(CC) -E -I../include -DWINDOWS_ABI=0 $< | as -o /tmp/original_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $< | as -o /tmp/original_object.obj) ; (cmp -s /tmp/translated_object.o /tmp/original_object.o && cmp -s /tmp/translated_object.obj /tmp/original_object.obj) +p384/%.S :: ../x86/p384/%.S ; (cat $< | sed -E -f ./attrofy.sed >$@) && ($(CC) -E -I../include -DWINDOWS_ABI=0 $@ | as -o /tmp/translated_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $@ | as -o /tmp/translated_object.obj) ; ($(CC) -E -I../include -DWINDOWS_ABI=0 $< | as -o /tmp/original_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $< | as -o /tmp/original_object.obj) ; (cmp -s /tmp/translated_object.o /tmp/original_object.o && cmp -s /tmp/translated_object.obj /tmp/original_object.obj) +p521/%.S :: ../x86/p521/%.S ; (cat $< | sed -E -f ./attrofy.sed >$@) && ($(CC) -E -I../include -DWINDOWS_ABI=0 $@ | as -o /tmp/translated_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $@ | as -o /tmp/translated_object.obj) ; ($(CC) -E -I../include -DWINDOWS_ABI=0 $< | as -o /tmp/original_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $< | as -o /tmp/original_object.obj) ; (cmp -s /tmp/translated_object.o /tmp/original_object.o && cmp -s /tmp/translated_object.obj /tmp/original_object.obj) +secp256k1/%.S :: ../x86/secp256k1/%.S ; (cat $< | sed -E -f ./attrofy.sed >$@) && ($(CC) -E -I../include -DWINDOWS_ABI=0 $@ | as -o /tmp/translated_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $@ | as -o /tmp/translated_object.obj) ; ($(CC) -E -I../include -DWINDOWS_ABI=0 $< | as -o /tmp/original_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $< | as -o /tmp/original_object.obj) ; (cmp -s /tmp/translated_object.o /tmp/original_object.o && cmp -s /tmp/translated_object.obj /tmp/original_object.obj) +sm2/%.S :: ../x86/sm2/%.S ; (cat $< | sed -E -f ./attrofy.sed >$@) && ($(CC) -E -I../include -DWINDOWS_ABI=0 $@ | as -o /tmp/translated_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $@ | as -o /tmp/translated_object.obj) ; ($(CC) -E -I../include -DWINDOWS_ABI=0 $< | as -o /tmp/original_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $< | 
as -o /tmp/original_object.obj) ; (cmp -s /tmp/translated_object.o /tmp/original_object.o && cmp -s /tmp/translated_object.obj /tmp/original_object.obj) + +clean:; rm -f */*.o + +clobber:; rm -f */*.o */*.S diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/README.md b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/README.md new file mode 100644 index 00000000000..2dd37851cb3 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/README.md @@ -0,0 +1,16 @@ +## AT&T syntax versions + +This directory contains AT&T syntax equivalents of the original Intel +syntax assembler files, generated automatically by a naive script and +subject to a sanity check that the object code doesn't change. All the +*/*.S files are generated ("make code"). Direct modification of these +files is not recommended. + + make code --- Generate */*.S files, subject to sanity check + make all --- Generate */*.S and */*.o files with sanity check + make clean --- Delete object files + make clobber --- Delete object files and generated code + +For more on the two syntax variants see: + + https://en.wikipedia.org/wiki/X86_assembly_language#Syntax diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/attrofy.sed b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/attrofy.sed new file mode 100644 index 00000000000..40547107795 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/attrofy.sed @@ -0,0 +1,136 @@ +############################################################################# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 +############################################################################# + + ############################################################################ + # * * * NOTE * * * # + # # + # This is a primitive script to automate conversion of certain particular # + # x86 assembler files from Intel to AT&T syntax. It is *not* a general # + # conversion and is very tied to the specific limitations and conventions # + # in the intended targets. Even in that setting we only use it with an # + # additional sanity check that the object code generated is the same in # + # both original and translated code according to the GNU assembler. 
# + ############################################################################ + +s/\.intel_syntax *noprefix// + +# Don't make any transforms on lines with the argument-taking macros + +/ addrow .+,/b +/ mulpadd .+,/b +/ mulpadda .+,/b +/ mulpade .+,/b +/ mulrow .+,/b + +# Reverse the argument order for binary and ternary instructions + +s/^(([a-z_0-9]+\:)* +[a-z_0-9]+ +)([^ (][^,/]*), *([^ ][^/,;]*)([/;].*)*$/\1\4, \3 \5/ +s/^(([a-z_0-9]+\:)* +[a-z_0-9]+ +)([^ (][^,/]*), *([^ ][^/,]*), *([^ ][^/,;]*)([/;].*)*$/\1\5, \4, \3 \6/ + +# Fix up whitespace just in case + +s/ +,/,/ + +# Decorate literals with $ + +s/^(([a-z_0-9]+\:)* +[a-z_0-9]+ +)(([-~+*/()A-Z0-9]*(0x[a-zA-Z0-9]*)*)* *\,)/\1$\3/ + +# Translate relative addresses with uppercase base variable +# Turn defined offset fields into explicit indirections to match + +s/^([^/][^[]+)[[]([A-Z_0-9]+)[]]/\1\2/ +s/^([^/][^[]+)[[]([A-Z][A-Z_0-9]*) *\+ *([^]]+)[]]/\1\3\+\2/ + +s/^\#define *([a-z][a-z_0-9]*) *([a-z][a-z_0-9]*) *\+(.*)/\#define \1 \3\(\2\)/ + +# Translate relative addresses + +s/^([^/][^[]+)[[]([a-z_0-9]+)[]]/\1\(\2\)/ +s/^([^/][^[]+)[[]([a-z][a-z_0-9]*) *\+ *8\*([a-z][a-z_0-9]*) *\+ *([a-z_A-Z0-9]+)[]]/\1\4\(\2,\3,8\)/ +s/^([^/][^[]+)[[]([a-z][a-z_0-9]*) *\+ *([a-z][a-z_0-9]*) *\+ *([a-z_A-Z0-9]+)[]]/\1\4\(\2,\3,1\)/ +s/^([^/][^[]+)[[]([a-z][a-z_0-9]*) *\+ *8\*([a-z][a-z_0-9]*) *\- *([a-z_A-Z0-9]+)[]]/\1\-\4\(\2,\3,8\)/ +s/^([^/][^[]+)[[](rip) *\+ *([a-z_A-Z0-9* ]+)[]]/\1\3\(\2\)/ +s/^([^/][^[]+)[[]([a-z][a-z_0-9]*) *\+ *([A-Z0-9* ]+)[]]/\1\3\(\2\)/ +s/^([^/][^[]+)[[]([a-z][a-z_0-9]*) *\- *([A-Z0-9* ]+)[]]/\1\-\3\(\2\)/ +s/^([^/][^[]+)[[]([a-z][a-z_0-9]*) *\+ *8\*([a-z][a-z_0-9]*)[]]/\1\(\2,\3,8\)/ +s/^([^/][^[]+)[[]([a-z][a-z_0-9]*) *\+ *4\*([a-z][a-z_0-9]*)[]]/\1\(\2,\3,4\)/ +s/^([^/][^[]+)[[]([a-z][a-z_0-9]*) *\+ *2\*([a-z][a-z_0-9]*)[]]/\1\(\2,\3,2\)/ +s/^([^/][^[]+)[[]([a-z][a-z_0-9]*) *\+ *([a-z][a-z_0-9]*)[]]/\1\(\2,\3\)/ +s/^([^/][^[]+)[[]([a-z][a-z_0-9]*) *\+ *([^]]+)[]]/\1\3\(\2\)/ +s/^([^/][^[]+)[[]([a-z][a-z_0-9]*) *\- *([^]]+)[]]/\1-\3\(\2\)/ +s/^([^/][^[]+)[[]([^]]+)[]]/\1\(\2\)/ + +# Put % in front of register names + +s/ ax *$/ %ax/ +s/ ax,/ %ax,/ +s/ cl *$/ %cl/ +s/ cl,/ %cl,/ +s/([[(,.;: ])([re][abcd]x)/\1\%\2/g +s/([[(,.;: ])([re]sp)/\1\%\2/g +s/([[(,.;: ])([re]bp)/\1\%\2/g +s/([[(,.;: ])([re]si)/\1\%\2/g +s/([[(,.;: ])([re]di)/\1\%\2/g +s/([[(,.;: ])(r8d*)/\1\%\2/g +s/([[(,.;: ])(r9d*)/\1\%\2/g +s/([[(,.;: ])(r1[0-5]d*)/\1\%\2/g +s/([[(,.;: ])([re]ip)/\1\%\2/g + +# Add explicit sizes to instructions + +s/QWORD PTR//g + +s/ adc / adcq /g +s/ adcx / adcxq /g +s/ add / addq /g +s/ adox / adoxq /g +s/ and / andq /g +s/ bsf / bsfq /g +s/ bsr / bsrq /g +s/ bswap / bswapq /g +s/ bt / btq /g +s/ call / callq /g +s/ cmovae / cmovaeq /g +s/ cmovb / cmovbq /g +s/ cmovc / cmovcq /g +s/ cmove / cmoveq /g +s/ cmovnc / cmovncq /g +s/ cmovne / cmovneq /g +s/ cmovnz / cmovnzq /g +s/ cmovz / cmovzq /g +s/ cmp / cmpq /g +s/ dec / decq /g +s/ imul / imulq /g +s/ inc / incq /g +s/ lea / leaq /g +s/ mov / movq /g +s/ movabs / movabsq /g +s/ mul / mulq /g +s/ mulx / mulxq /g +s/ neg / negq /g +s/ not / notq /g +s/ or / orq /g +s/ pop / popq /g +s/ push / pushq /g +s/ sar / sarq /g +s/ sbb / sbbq /g +s/ shl / shlq /g +s/ shld / shldq /g +s/ shr / shrq /g +s/ shrd / shrdq /g +s/ sub / subq /g +s/ test / testq /g +s/ xor / xorq /g + +s/q( .*zeroe)/l\1/ +s/q( .*plus2e)/l\1/ +s/q( .*short)/l\1/ +s/q( .*%e)/l\1/ +s/q( .*%r[0-9]+d)/l\1/ +s/q( .*%ax)/w\1/ + +# Eliminate any trailing spaces, just to be tidy + +s/ +$// diff --git 
a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_add_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_add_p25519.S new file mode 100644 index 00000000000..b4c3f21fb78 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_add_p25519.S @@ -0,0 +1,103 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Add modulo p_25519, z := (x + y) mod p_25519, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_add_p25519 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add_p25519) + .text + +#define z %rdi +#define x %rsi +#define y %rdx + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +// These also re-use inputs x and y when safe to do so + +#define c0 %rax +#define c1 %rcx +#define c2 %rsi +#define c3 %rdx +#define c0short %eax +#define c1short %ecx +#define c2short %esi +#define c3short %edx + +S2N_BN_SYMBOL(bignum_add_p25519): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Add as [d3; d2; d1; d0] = x + y; since we assume x, y < 2^255 - 19 +// this sum fits in 256 bits. + + movq (x), d0 + addq (y), d0 + movq 8(x), d1 + adcq 8(y), d1 + movq 16(x), d2 + adcq 16(y), d2 + movq 24(x), d3 + adcq 24(y), d3 + +// Now x + y >= 2^255 - 19 <=> x + y + 19 >= 2^255. +// Form [c3; c2; c1; c0] = (x + y) + 19 + + movl $19, c0short + xorl c1short, c1short + xorl c2short, c2short + xorl c3short, c3short + + addq d0, c0 + adcq d1, c1 + adcq d2, c2 + adcq d3, c3 + +// Test the top bit to see if this is >= 2^255, and clear it as a masking +// so that in that case the result is exactly (x + y) - (2^255 - 19). +// Then select the output according to that top bit as that or just x + y. + + btr $63, c3 + cmovcq c0, d0 + cmovcq c1, d1 + cmovcq c2, d2 + cmovcq c3, d3 + +// Store the result + + movq d0, (z) + movq d1, 8(z) + movq d2, 16(z) + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_cmul_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_cmul_p25519.S new file mode 100644 index 00000000000..4eed06e8532 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_cmul_p25519.S @@ -0,0 +1,113 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word modulo p_25519, z := (c * x) mod p_25519, assuming +// x reduced +// Inputs c, x[4]; output z[4] +// +// extern void bignum_cmul_p25519 +// (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = c, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = c, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p25519) + .text + +#define z %rdi + +// Temporarily moved here for initial multiply + +#define x %rcx +#define c %rcx +#define cshort %ecx + +// Used as a zero register after the initial move + +#define zero %rsi +#define zeroe %esi + +// Likewise this is thrown away after initial multiply + +#define d %rdx +#define a %rax +#define ashort %eax + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +S2N_BN_SYMBOL(bignum_cmul_p25519): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Shuffle inputs (since we want multiplier in %rdx) + + movq %rdx, x + movq %rsi, d + +// Multiply, accumulating the result as [d4;d3;d2;d1;d0] + + mulxq (x), d0, d1 + mulxq 8(x), a, d2 + addq a, d1 + mulxq 16(x), a, d3 + adcq a, d2 + mulxq 24(x), d, a + adcq d, d3 + adcq $0, a + +// Let [d4;d3;d2;d1;d0] = 2^255 * h + l, and use q = h + 1 as the initial +// quotient estimate, which is either right or 1 too big. + + shldq $1, d3, a + movl $19, cshort + incq a + bts $63, d3 + mulq c + xorl zeroe, zeroe + addq a, d0 + adcq d, d1 + adcq zero, d2 + adcq zero, d3 + +// Correct if CF = 0 by subtracting 19, either way masking to +// 255 bits, i.e. by effectively adding p_25519 to the "full" answer + + cmovcq zero, c + subq c, d0 + sbbq zero, d1 + sbbq zero, d2 + sbbq zero, d3 + btr $63, d3 + +// Write everything back + + movq d0, (z) + movq d1, 8(z) + movq d2, 16(z) + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_cmul_p25519_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_cmul_p25519_alt.S new file mode 100644 index 00000000000..e31805f2342 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_cmul_p25519_alt.S @@ -0,0 +1,127 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word modulo p_25519, z := (c * x) mod p_25519, assuming +// x reduced +// Inputs c, x[4]; output z[4] +// +// extern void bignum_cmul_p25519_alt +// (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = c, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = c, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p25519_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p25519_alt) + .text + +#define z %rdi + +// Temporarily moved here for initial multiply + +#define x %rcx + +// Used as a zero register after the initial move + +#define zero %rsi +#define zeroe %esi + +// Likewise this is thrown away after initial multiply + +#define d %rdx +#define a %rax +#define ashort %eax + +#define c %rcx +#define cshort %ecx + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 +#define d4 %rdx + +S2N_BN_SYMBOL(bignum_cmul_p25519_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Shuffle inputs (since we want %rdx for the high parts of products) + + movq %rdx, x + +// Multiply, accumulating the result as [d4;d3;d2;d1;d0] + + movq (x), a + mulq %rsi + movq a, d0 + movq d, d1 + + movq 8(x), a + xorq d2, d2 + mulq %rsi + addq a, d1 + adcq d, d2 + + movq 16(x), a + mulq %rsi + addq a, d2 + adcq $0, d + + movq 24(x), a + movq d, d3 + mulq %rsi + xorl zeroe, zeroe + addq a, d3 + adcq zero, d4 + +// Let [d4;d3;d2;d1;d0] = 2^255 * h + l, and use q = h + 1 as the initial +// quotient estimate, which is either right or 1 too big. + + shldq $1, d3, d4 + movl $19, cshort + leaq 1(d4), a + bts $63, d3 + mulq c + addq a, d0 + adcq d, d1 + adcq zero, d2 + adcq zero, d3 + +// Correct if CF = 0 by subtracting 19, either way masking to +// 255 bits, i.e. by effectively adding p_25519 to the "full" answer + + cmovcq zero, c + subq c, d0 + sbbq zero, d1 + sbbq zero, d2 + sbbq zero, d3 + btr $63, d3 + +// Write everything back + + movq d0, (z) + movq d1, 8(z) + movq d2, 16(z) + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_double_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_double_p25519.S new file mode 100644 index 00000000000..dec97c2c98c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_double_p25519.S @@ -0,0 +1,101 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Double modulo p_25519, z := (2 * x) mod p_25519, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_double_p25519 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_double_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_double_p25519) + .text + +#define z %rdi +#define x %rsi + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +// These also re-use input x when safe to do so + +#define c0 %rax +#define c1 %rcx +#define c2 %rsi +#define c3 %rdx +#define c0short %eax +#define c1short %ecx +#define c2short %esi +#define c3short %edx + +S2N_BN_SYMBOL(bignum_double_p25519): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Add as [d3; d2; d1; d0] = 2 * x; since we assume x < 2^255 - 19 +// this result fits in 256 bits. + + movq (x), d0 + addq d0, d0 + movq 8(x), d1 + adcq d1, d1 + movq 16(x), d2 + adcq d2, d2 + movq 24(x), d3 + adcq d3, d3 + +// Now 2 * x >= 2^255 - 19 <=> 2 * x + 19 >= 2^255. +// Form [c3; c2; c1; c0] = (2 * x) + 19 + + movl $19, c0short + xorl c1short, c1short + xorl c2short, c2short + xorl c3short, c3short + + addq d0, c0 + adcq d1, c1 + adcq d2, c2 + adcq d3, c3 + +// Test the top bit to see if this is >= 2^255, and clear it as a masking +// so that in that case the result is exactly (2 * x) - (2^255 - 19). +// Then select the output according to that top bit as that or just 2 * x. + + btr $63, c3 + cmovcq c0, d0 + cmovcq c1, d1 + cmovcq c2, d2 + cmovcq c3, d3 + +// Store the result + + movq d0, (z) + movq d1, 8(z) + movq d2, 16(z) + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_inv_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_inv_p25519.S new file mode 100644 index 00000000000..f83974a21f4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_inv_p25519.S @@ -0,0 +1,1587 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Modular inverse modulo p_25519 = 2^255 - 19 +// Input x[4]; output z[4] +// +// extern void bignum_inv_p25519(uint64_t z[static 4],uint64_t x[static 4]); +// +// Assuming the 4-digit input x is coprime to p_25519, i.e. is not divisible +// by it, returns z < p_25519 such that x * z == 1 (mod p_25519). The input +// x does not need to be reduced modulo p_25519, but the output always is. 
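+//
+// Equivalently, for x coprime to p_25519 the result is x^(p_25519 - 2) mod
+// p_25519, since x * x^(p_25519 - 2) == x^(p_25519 - 1) == 1 (mod p_25519)
+// by Fermat's little theorem. The code below does not compute that
+// exponentiation; it instead iterates an inlined variant of the s2n-bignum
+// word_divstep59 operation (the divstep59 macro below) while accumulating
+// the corresponding [u,v] cofactors.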
+// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_inv_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_inv_p25519) + .text + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define f 0(%rsp) +#define g (4*N)(%rsp) +#define u (8*N)(%rsp) +#define v (12*N)(%rsp) +#define tmp (16*N)(%rsp) +#define tmp2 (17*N)(%rsp) +#define i (18*N)(%rsp) +#define d (19*N)(%rsp) + +#define mat (20*N)(%rsp) + +// Backup for the input pointer + +#define res (24*N)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (26*N) + +// Syntactic variants to make x86_att version simpler to generate + +#define F 0 +#define G (4*N) +#define U (8*N) +#define V (12*N) +#define MAT (20*N) + +#define ff (%rsp) +#define gg (4*N)(%rsp) + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. +// But different in register usage and returning the final matrix as +// +// [ %r8 %r10] +// [ %r12 %r14] +// +// and also returning the matrix still negated (which doesn't matter) + +#define divstep59(din,fin,gin) \ + movq din, %rsi ; \ + movq fin, %rdx ; \ + movq gin, %rcx ; \ + movq %rdx, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq $0xfffffffffffffffe, %rax ; \ + xorl %ebp, %ebp ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, 
%rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq 
%rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %rdx ; \ + leaq (%rcx,%rax), %rdi ; \ + shlq $0x16, %rdx ; \ + shlq $0x16, %rdi ; \ + sarq $0x2b, %rdx ; \ + sarq $0x2b, %rdi ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %rbx ; \ + leaq (%rcx,%rax), %rcx ; \ + sarq $0x2a, %rbx ; \ + sarq $0x2a, %rcx ; \ + movq %rdx, MAT(%rsp) ; \ + movq %rbx, MAT+0x8(%rsp) ; \ + movq %rdi, MAT+0x10(%rsp) ; \ + movq %rcx, MAT+0x18(%rsp) ; \ + movq fin, %r12 ; \ + imulq %r12, %rdi ; \ + imulq %rdx, %r12 ; \ + movq gin, %r13 ; \ + imulq %r13, %rbx ; \ + imulq %rcx, %r13 ; \ + addq %rbx, %r12 ; \ + addq %rdi, %r13 ; \ + sarq $0x14, %r12 ; \ + sarq $0x14, %r13 ; \ + movq %r12, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + movq %r13, %rcx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq $0xfffffffffffffffe, %rax ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq 
%r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + 
btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %r8 ; \ + leaq (%rcx,%rax), %r10 ; \ + shlq $0x16, %r8 ; \ + shlq $0x16, %r10 ; \ + sarq $0x2b, %r8 ; \ + sarq $0x2b, %r10 ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %r15 ; \ + leaq (%rcx,%rax), %r11 ; \ + sarq $0x2a, %r15 ; \ + sarq $0x2a, %r11 ; \ + movq %r13, %rbx ; \ + movq %r12, %rcx ; \ + imulq %r8, %r12 ; \ + imulq %r15, %rbx ; \ + addq %rbx, %r12 ; \ + imulq %r11, %r13 ; \ + imulq %r10, %rcx ; \ + addq %rcx, %r13 ; \ + sarq $0x14, %r12 ; \ + sarq $0x14, %r13 ; \ + movq %r12, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + movq %r13, %rcx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq MAT(%rsp), %rax ; \ + imulq %r8, %rax ; \ + movq MAT+0x10(%rsp), %rdx ; \ + imulq %r15, %rdx ; \ + imulq MAT+0x8(%rsp), %r8 ; \ + imulq MAT+0x18(%rsp), %r15 ; \ + addq %r8, %r15 ; \ + leaq (%rax,%rdx), %r9 ; \ + movq MAT(%rsp), %rax ; \ + imulq %r10, %rax ; \ + movq MAT+0x10(%rsp), %rdx ; \ + imulq %r11, %rdx ; \ + imulq MAT+0x8(%rsp), %r10 ; \ + imulq MAT+0x18(%rsp), %r11 ; \ + addq %r10, %r11 ; \ + leaq (%rax,%rdx), %r13 ; \ + movq $0xfffffffffffffffe, %rax ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + 
cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %r8 ; \ + leaq (%rcx,%rax), %r12 ; \ + shlq $0x15, %r8 ; \ + 
shlq $0x15, %r12 ; \ + sarq $0x2b, %r8 ; \ + sarq $0x2b, %r12 ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %r10 ; \ + leaq (%rcx,%rax), %r14 ; \ + sarq $0x2b, %r10 ; \ + sarq $0x2b, %r14 ; \ + movq %r9, %rax ; \ + imulq %r8, %rax ; \ + movq %r13, %rdx ; \ + imulq %r10, %rdx ; \ + imulq %r15, %r8 ; \ + imulq %r11, %r10 ; \ + addq %r8, %r10 ; \ + leaq (%rax,%rdx), %r8 ; \ + movq %r9, %rax ; \ + imulq %r12, %rax ; \ + movq %r13, %rdx ; \ + imulq %r14, %rdx ; \ + imulq %r15, %r12 ; \ + imulq %r11, %r14 ; \ + addq %r12, %r14 ; \ + leaq (%rax,%rdx), %r12 + +S2N_BN_SYMBOL(bignum_inv_p25519): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room for temporaries + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Save the return pointer for the end so we can overwrite %rdi later + + movq %rdi, res + +// Copy the input and the prime into the main f and g variables. +// Make sure x is reduced so that g <= f as assumed in the bound proof. + + xorl %eax, %eax + leaq -19(%rax), %rcx + notq %rax + movq %rcx, F(%rsp) + movq %rax, F+8(%rsp) + movq %rax, F+16(%rsp) + btr $63, %rax + movq %rax, F+24(%rsp) + + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 + movl $0x1, %eax + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $19, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, G(%rsp) + movq %rcx, G+0x8(%rsp) + movq %r8, G+0x10(%rsp) + movq %r9, G+0x18(%rsp) + +// Also maintain weakly reduced < 2*p_25519 vector [u,v] such that +// [f,g] == x * 2^{590-59*i} * [u,v] (mod p_25519) +// starting with [p_25519,x] == x * 2^{590-59*0} * [0,2^-590] (mod p_25519) + + xorl %eax, %eax + movq %rax, U(%rsp) + movq %rax, U+8(%rsp) + movq %rax, U+16(%rsp) + movq %rax, U+24(%rsp) + + movq $0xa0f99e2375022099, %rax + movq %rax, V(%rsp) + movq $0xa8c68f3f1d132595, %rax + movq %rax, V+8(%rsp) + movq $0x6c6c893805ac5242, %rax + movq %rax, V+16(%rsp) + movq $0x276508b241770615, %rax + movq %rax, V+24(%rsp) + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special tenth iteration after a uniform +// first 9. + + movq $10, i + movq $1, d + jmp bignum_inv_p25519_midloop + +bignum_inv_p25519_loop: + +// Separate out the matrix into sign-magnitude pairs + + movq %r8, %r9 + sarq $63, %r9 + xorq %r9, %r8 + subq %r9, %r8 + + movq %r10, %r11 + sarq $63, %r11 + xorq %r11, %r10 + subq %r11, %r10 + + movq %r12, %r13 + sarq $63, %r13 + xorq %r13, %r12 + subq %r13, %r12 + + movq %r14, %r15 + sarq $63, %r15 + xorq %r15, %r14 + subq %r15, %r14 + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in temporary storage for the [u,v] part and do [f,g] first. + + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, tmp + + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, tmp2 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. 
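+//
+// In matrix terms, with the (still negated) matrix returned by divstep59 as
+//
+//      [ m00  m01 ]   [ %r8   %r10 ]
+//      [ m10  m11 ] = [ %r12  %r14 ]
+//
+// the digit-by-digit accumulation below computes
+//
+//      f' = (m00 * f + m01 * g) >> 59
+//      g' = (m10 * f + m11 * g) >> 59
+//
+// and the same matrix is afterwards applied to [u,v], keeping those results
+// reduced below 2 * p_25519. Each signed product m * d is formed from the
+// sign-magnitude split above as |m| * (d XOR sign) plus a (|m| AND sign)
+// offset added once at the bottom (the offsets computed above, also saved
+// in tmp and tmp2 for the [u,v] pass), i.e. complementing rather than
+// negating.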
+// +// Digit 0 of [f,g] + + xorl %ebx, %ebx + movq F(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq G(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq F(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq G(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + +// Digit 1 of [f,g] + + xorl %ecx, %ecx + movq F+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq G+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $59, %rbx, %rdi + movq %rdi, F(%rsp) + + xorl %edi, %edi + movq F+N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq G+N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $59, %rbp, %rsi + movq %rsi, G(%rsp) + +// Digit 2 of [f,g] + + xorl %esi, %esi + movq F+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq G+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $59, %rcx, %rbx + movq %rbx, F+N(%rsp) + + xorl %ebx, %ebx + movq F+2*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq G+2*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $59, %rdi, %rbp + movq %rbp, G+N(%rsp) + +// Digits 3 and 4 of [f,g] + + movq F+3*N(%rsp), %rax + xorq %r9, %rax + movq %rax, %rbp + sarq $63, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq G+3*N(%rsp), %rax + xorq %r11, %rax + movq %rax, %rdx + sarq $63, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $59, %rsi, %rcx + movq %rcx, F+2*N(%rsp) + shrdq $59, %rbp, %rsi + + movq F+3*N(%rsp), %rax + movq %rsi, F+3*N(%rsp) + + xorq %r13, %rax + movq %rax, %rsi + sarq $63, %rsi + andq %r12, %rsi + negq %rsi + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rsi + movq G+3*N(%rsp), %rax + xorq %r15, %rax + movq %rax, %rdx + sarq $63, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $59, %rbx, %rdi + movq %rdi, G+2*N(%rsp) + shrdq $59, %rsi, %rbx + movq %rbx, G+3*N(%rsp) + +// Get the initial carries back from storage and do the [u,v] accumulation + + movq tmp, %rbx + movq tmp2, %rbp + +// Digit 0 of [u,v] + + xorl %ecx, %ecx + movq U(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + + xorl %esi, %esi + movq U(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V(%rsp) + +// Digit 1 of [u,v] + + xorl %ebx, %ebx + movq U+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq U+N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, U+N(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq V+N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, V+N(%rsp) + +// Digit 2 of [u,v] + + xorl %ecx, %ecx + movq U+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx 
+ + xorl %esi, %esi + movq U+2*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U+2*N(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V+2*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V+2*N(%rsp) + +// Digits 3 and 4 of u (top is unsigned) + + movq U+3*N(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+3*N(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + +// Modular reduction of u + + movq %rdx, %rbx + shldq $1, %rcx, %rdx + sarq $63, %rbx + addq %rbx, %rdx + movl $19, %eax + imulq %rdx + movq U(%rsp), %r8 + addq %rax, %r8 + movq %r8, U(%rsp) + movq U+N(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, U+N(%rsp) + movq U+2*N(%rsp), %r8 + adcq %rbx, %r8 + movq %r8, U+2*N(%rsp) + adcq %rbx, %rcx + shlq $63, %rax + addq %rax, %rcx + +// Preload for last use of old u digit 3 + + movq U+3*N(%rsp), %rax + movq %rcx, U+3*N(%rsp) + +// Digits 3 and 4 of v (top is unsigned) + + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx + negq %rcx + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rcx + movq V+3*N(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + +// Modular reduction of v + + movq %rdx, %rcx + shldq $1, %rsi, %rdx + sarq $63, %rcx + movl $19, %eax + addq %rcx, %rdx + imulq %rdx + movq V(%rsp), %r8 + addq %rax, %r8 + movq %r8, V(%rsp) + movq V+N(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, V+N(%rsp) + movq V+2*N(%rsp), %r8 + adcq %rcx, %r8 + movq %r8, V+2*N(%rsp) + adcq %rcx, %rsi + shlq $63, %rax + addq %rax, %rsi + movq %rsi, V+3*N(%rsp) + +bignum_inv_p25519_midloop: + + divstep59(d,ff,gg) + movq %rsi, d + +// Next iteration + + decq i + jnz bignum_inv_p25519_loop + +// The 10th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + movq F(%rsp), %rax + movq G(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $63, %rax + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * [u,v] (mod p_25519) +// we want to flip the sign of u according to that of f. 
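+//
+// Concretely: f == x * u (mod p_25519) with f = +1 or -1, so the inverse is
+// z = sign(f) * u mod p_25519. The sign mask of f just computed in %rax is
+// therefore XORed into the sign masks of the matrix entries below, flipping
+// the sign of the final u composition accordingly.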
+ + movq %r8, %r9 + sarq $63, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + + movq %r10, %r11 + sarq $63, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + + movq %r12, %r13 + sarq $63, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + + movq %r14, %r15 + sarq $63, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + +// Adjust the initial value to allow for complement instead of negation + + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + +// Digit 0 of [u] + + xorl %r13d, %r13d + movq U(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq V(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + +// Digit 1 of [u] + + xorl %r14d, %r14d + movq U+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq V+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + +// Digit 2 of [u] + + xorl %r15d, %r15d + movq U+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq V+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + +// Digits 3 and 4 of u (top is unsigned) + + movq U+3*N(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq V+3*N(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + +// Modular reduction of u, this time strictly 2^255-19. + + movq %r9, %rax + shldq $1, %r15, %rax + sarq $63, %r9 + movl $19, %ebx + leaq 1(%rax,%r9,1), %rax + imulq %rbx + xorl %ebp, %ebp + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r9, %r14 + adcq %r9, %r15 + shlq $63, %rax + addq %rax, %r15 + cmovns %rbp, %rbx + subq %rbx, %r12 + sbbq %rbp, %r13 + sbbq %rbp, %r14 + sbbq %rbp, %r15 + btr $0x3f, %r15 + +// Store it back to the final output + + movq res, %rdi + movq %r12, (%rdi) + movq %r13, N(%rdi) + movq %r14, 2*N(%rdi) + movq %r15, 3*N(%rdi) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_invsqrt_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_invsqrt_p25519.S new file mode 100644 index 00000000000..4c85691c433 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_invsqrt_p25519.S @@ -0,0 +1,594 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Inverse square root modulo p_25519 = 2^255 - 19 +// Input x[4]; output function return (Legendre symbol) and z[4] +// +// extern int64_t bignum_invsqrt_p25519(uint64_t z[static 4],uint64_t x[static 4]); +// +// Given a 4-digit input x, returns a modular inverse square root mod p_25519, +// i.e. a z such that x * z^2 == 1 (mod p_25519), whenever one exists. The +// inverse square root z is chosen so that its LSB is even (note that p_25519-z +// is another possibility). 
The function return is the Legendre/Jacobi symbol +// (x//p_25519), which indicates whether indeed x has a modular inverse square +// root and hence whether the result is meaningful: +// +// 0: x is divisible by p_25519 so trivially there is no inverse square root +// +1: x is coprime to p_25519 and z is indeed an inverse square root +// -1: x is coprime to p_25519 but there is no (inverse or direct) square root +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_invsqrt_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_invsqrt_p25519) + .text + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define a 0(%rsp) +#define b (4*N)(%rsp) +#define s (8*N)(%rsp) +#define t (12*N)(%rsp) +#define u (16*N)(%rsp) +#define res (20*N)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (22*N) + +// Corrupted versions when stack is down 8 more + +#define u8 (17*N)(%rsp) + +// Syntactic variants to make x86_att version simpler to generate + +#define A 0 +#define B (4*N) +#define S (8*N) +#define T (12*N) +#define U (16*N) +#define U8 (17*N) + +S2N_BN_SYMBOL(bignum_invsqrt_p25519): + _CET_ENDBR + +// In this case the Windows form literally makes a subroutine call. +// This avoids hassle arising from subroutine offsets + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + callq bignum_invsqrt_p25519_standard + popq %rsi + popq %rdi + ret + +bignum_invsqrt_p25519_standard: +#endif + +// Save registers and make room for temporaries + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Save the return pointer for the end so we can overwrite %rdi later + + movq %rdi, res + +// Set up reduced version of the input argument a = x mod p_25519. 
Then +// get the candidate inverse square root s = a^{252-3} + + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 + movl $0x1, %eax + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, A(%rsp) + movq %rcx, A+0x8(%rsp) + movq %r8, A+0x10(%rsp) + movq %r9, A+0x18(%rsp) + + // Power 2^2 - 1 = 3 + + leaq T(%rsp), %rdi + movq $1, %rsi + leaq A(%rsp), %rdx + callq bignum_invsqrt_p25519_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq T(%rsp), %rsi + leaq A(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + + // Power 2^4 - 1 = 15 + + leaq S(%rsp), %rdi + movq $2, %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + + // Power 2^5 - 1 = 31 + + leaq S(%rsp), %rdi + movq $1, %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq A(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + + // Power 2^10 - 1 + + leaq S(%rsp), %rdi + movq $5, %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + + // Power 2^20 - 1 + + leaq S(%rsp), %rdi + movq $10, %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + + // Power 2^25 - 1 + + leaq S(%rsp), %rdi + movq $5, %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + + // Power 2^50 - 1 + + leaq S(%rsp), %rdi + movq $25, %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + + // Power 2^100 - 1 + + leaq S(%rsp), %rdi + movq $50, %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + + // Power 2^125 - 1 + + leaq S(%rsp), %rdi + movq $25, %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + + // Power 2^250 - 1 + + leaq S(%rsp), %rdi + movq $125, %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + + // Power 2^252 - 3 + + leaq S(%rsp), %rdi + movq $2, %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_nsqr_p25519 + + leaq S(%rsp), %rdi + leaq S(%rsp), %rsi + leaq A(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + +// s = a^{2^252-3} is now one candidate inverse square root. 
+// Generate the other one t = s * j_25519 where j_25519 = sqrt(-1) + + movq $0xc4ee1b274a0ea0b0, %rax + movq %rax, T(%rsp) + movq $0x2f431806ad2fe478, %rax + movq %rax, T+8(%rsp) + movq $0x2b4d00993dfbd7a7, %rax + movq %rax, T+16(%rsp) + movq $0x2b8324804fc1df0b, %rax + movq %rax, T+24(%rsp) + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + +// Now multiplex between them according to whether a * s^2 = 1 + + leaq B(%rsp), %rdi + movq $1, %rsi + leaq S(%rsp), %rdx + callq bignum_invsqrt_p25519_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq A(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + + movq B(%rsp), %rax + xorq $1, %rax + movq B+8(%rsp), %rbx + orq %rbx, %rax + movq B+16(%rsp), %rcx + movq B+24(%rsp), %rdx + orq %rdx, %rcx + orq %rcx, %rax + + movq S(%rsp), %rax + movq T(%rsp), %rbx + cmovnzq %rbx, %rax + movq S+8(%rsp), %rbx + movq T+8(%rsp), %rcx + cmovnzq %rcx, %rbx + movq S+16(%rsp), %rcx + movq T+16(%rsp), %rdx + cmovnzq %rdx, %rcx + movq S+24(%rsp), %rbp + movq T+24(%rsp), %rdx + cmovnzq %rdx, %rbp + +// For definiteness, choose "positive" (LSB=0) inverse square root + + xorl %edx, %edx + leaq -19(%rdx), %r8 + leaq -1(%rdx), %r11 + movq %r11, %r9 + movq %r11, %r10 + btr $63, %r11 + + subq %rax, %r8 + sbbq %rbx, %r9 + sbbq %rcx, %r10 + sbbq %rbp, %r11 + + movq res, %rdx + testq $1, %rax + cmovnzq %r8, %rax + movq %rax, (%rdx) + cmovnzq %r9, %rbx + movq %rbx, 8(%rdx) + cmovnzq %r10, %rcx + movq %rcx, 16(%rdx) + cmovnzq %r11, %rbp + movq %rbp, 24(%rdx) + +// Determine if it is is indeed an inverse square root, also distinguishing +// the degenerate x * z^2 == 0 (mod p_25519) case, which is equivalent to +// x == 0 (mod p_25519). Hence return the Legendre-Jacobi symbol as required. 
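+//
+// The check below squares the value just written to z (the pointer to z is
+// still in %rdx from the stores above) and multiplies by a, so in effect it
+// computes b = a * z^2 mod p_25519 and returns
+//
+//      b == 1  ->  +1     (z is indeed an inverse square root of x)
+//      b == 0  ->   0     (only possible when a == 0, as p_25519 is prime)
+//      else    ->  -1     (no square root of x exists mod p_25519)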
+ + leaq B(%rsp), %rdi + movq $1, %rsi + callq bignum_invsqrt_p25519_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq A(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + + movq $1, %rax + movq B(%rsp), %rbp + xorq %rbp, %rax + movq B+8(%rsp), %rbx + orq %rbx, %rax + orq %rbx, %rbp + movq B+16(%rsp), %rcx + movq B+24(%rsp), %rdx + orq %rdx, %rcx + orq %rcx, %rax + orq %rcx, %rbp + + negq %rax + sbbq %rax, %rax + leaq 1(%rax,%rax,1), %rax + + testq %rbp, %rbp + cmovzq %rbp, %rax + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +// ************************************************************* +// Local z = x * y +// ************************************************************* + +bignum_invsqrt_p25519_mul_p25519: + movq %rdx, %rcx + xorl %ebp, %ebp + movq (%rcx), %rdx + mulxq (%rsi), %r8, %r9 + mulxq 0x8(%rsi), %rax, %r10 + addq %rax, %r9 + mulxq 0x10(%rsi), %rax, %r11 + adcq %rax, %r10 + mulxq 0x18(%rsi), %rax, %r12 + adcq %rax, %r11 + adcq %rbp, %r12 + xorl %ebp, %ebp + movq 0x8(%rcx), %rdx + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rax, %r13 + adcxq %rax, %r12 + adoxq %rbp, %r13 + adcq %rbp, %r13 + xorl %ebp, %ebp + movq 0x10(%rcx), %rdx + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x18(%rsi), %rax, %r14 + adcxq %rax, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + xorl %ebp, %ebp + movq 0x18(%rcx), %rdx + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rcx, %r15 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + movl $0x26, %edx + mulxq %r15, %rax, %rbx + adcxq %rcx, %r14 + adoxq %rbp, %r15 + adcq %rbp, %r15 + addq %r11, %rax + adcq %rbp, %rbx + btq $0x3f, %rax + adcq %rbx, %rbx + leaq 0x1(%rbx), %rcx + imulq $0x13, %rcx, %rcx + xorl %ebp, %ebp + adoxq %rcx, %r8 + mulxq %r12, %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq %r13, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r14, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq %r15, %rax, %rbx + adcq %rax, %r11 + shlq $0x3f, %rcx + cmpq %rcx, %r11 + movl $0x13, %eax + cmovns %rbp, %rax + subq %rax, %r8 + sbbq %rbp, %r9 + sbbq %rbp, %r10 + sbbq %rbp, %r11 + btr $0x3f, %r11 + movq %r8, (%rdi) + movq %r9, 0x8(%rdi) + movq %r10, 0x10(%rdi) + movq %r11, 0x18(%rdi) + ret + +// ************************************************************* +// Local z = 2^n * x +// ************************************************************* + +bignum_invsqrt_p25519_nsqr_p25519: + +// Copy input argument into u + + movq (%rdx), %rax + movq 8(%rdx), %rbx + movq 16(%rdx), %rcx + movq 24(%rdx), %rdx + movq %rax, U8(%rsp) + movq %rbx, U8+8(%rsp) + movq %rcx, U8+16(%rsp) + movq %rdx, U8+24(%rsp) + +// Main squaring loop, accumulating in u consistently and +// only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38 + +bignum_invsqrt_p25519_loop: + movq U8(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq U8+0x8(%rsp), %r9, %r10 + mulxq U8+0x18(%rsp), %r11, %r12 + movq U8+0x10(%rsp), %rdx + mulxq U8+0x18(%rsp), %r13, %r14 + xorl %ebx, %ebx + mulxq U8(%rsp), %rax, %rcx + adcxq 
%rax, %r10 + adoxq %rcx, %r11 + mulxq U8+0x8(%rsp), %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r12 + movq U8+0x18(%rsp), %rdx + mulxq U8+0x8(%rsp), %rax, %rcx + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rbx, %r13 + adoxq %rbx, %r14 + adcq %rbx, %r14 + xorl %ebx, %ebx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq U8+0x8(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq U8+0x10(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq U8+0x18(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rbx, %r15 + adoxq %rbx, %r15 + movl $0x26, %edx + xorl %ebx, %ebx + mulxq %r12, %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + mulxq %r13, %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq %r14, %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq %r15, %rax, %r12 + adcxq %rax, %r11 + adoxq %rbx, %r12 + adcxq %rbx, %r12 + shldq $0x1, %r11, %r12 + btr $0x3f, %r11 + movl $0x13, %edx + imulq %r12, %rdx + addq %rdx, %r8 + adcq %rbx, %r9 + adcq %rbx, %r10 + adcq %rbx, %r11 + movq %r8, U8(%rsp) + movq %r9, U8+0x8(%rsp) + movq %r10, U8+0x10(%rsp) + movq %r11, U8+0x18(%rsp) + +// Loop as applicable + + decq %rsi + jnz bignum_invsqrt_p25519_loop + +// We know the intermediate result x < 2^256 - 38, and now we do strict +// modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255 +// which is equivalent to a "ns" condition. We just use the results where +// they were in registers [%r11;%r10;%r9;%r8] instead of re-loading them. + + movl $19, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + + cmovns %r8, %rax + cmovns %r9, %rbx + cmovns %r10, %rcx + cmovns %r11, %rdx + btr $63, %rdx + movq %rax, (%rdi) + movq %rbx, 8(%rdi) + movq %rcx, 16(%rdi) + movq %rdx, 24(%rdi) + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_invsqrt_p25519_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_invsqrt_p25519_alt.S new file mode 100644 index 00000000000..78bcb3e5cfb --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_invsqrt_p25519_alt.S @@ -0,0 +1,675 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Inverse square root modulo p_25519 = 2^255 - 19 +// Input x[4]; output function return (Legendre symbol) and z[4] +// +// extern int64_t bignum_invsqrt_p25519_alt(uint64_t z[static 4],uint64_t x[static 4]); +// +// Given a 4-digit input x, returns a modular inverse square root mod p_25519, +// i.e. a z such that x * z^2 == 1 (mod p_25519), whenever one exists. The +// inverse square root z is chosen so that its LSB is even (note that p_25519-z +// is another possibility). 
The function return is the Legendre/Jacobi symbol +// (x//p_25519), which indicates whether indeed x has a modular inverse square +// root and hence whether the result is meaningful: +// +// 0: x is divisible by p_25519 so trivially there is no inverse square root +// +1: x is coprime to p_25519 and z is indeed an inverse square root +// -1: x is coprime to p_25519 but there is no (inverse or direct) square root +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_invsqrt_p25519_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_invsqrt_p25519_alt) + .text + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define a 0(%rsp) +#define b (4*N)(%rsp) +#define s (8*N)(%rsp) +#define t (12*N)(%rsp) +#define u (16*N)(%rsp) +#define res (20*N)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (22*N) + +// Corrupted versions when stack is down 8 more + +#define u8 (17*N)(%rsp) + +// Syntactic variants to make x86_att version simpler to generate + +#define A 0 +#define B (4*N) +#define S (8*N) +#define T (12*N) +#define U (16*N) +#define U8 (17*N) + +S2N_BN_SYMBOL(bignum_invsqrt_p25519_alt): + _CET_ENDBR + +// In this case the Windows form literally makes a subroutine call. +// This avoids hassle arising from subroutine offsets + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + callq bignum_invsqrt_p25519_alt_standard + popq %rsi + popq %rdi + ret + +bignum_invsqrt_p25519_alt_standard: +#endif + +// Save registers and make room for temporaries + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Save the return pointer for the end so we can overwrite %rdi later + + movq %rdi, res + +// Set up reduced version of the input argument a = x mod p_25519. 
Then +// get the candidate inverse square root s = a^{252-3} + + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 + movl $0x1, %eax + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, A(%rsp) + movq %rcx, A+0x8(%rsp) + movq %r8, A+0x10(%rsp) + movq %r9, A+0x18(%rsp) + + // Power 2^2 - 1 = 3 + + leaq T(%rsp), %rdi + movq $1, %rsi + leaq A(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq T(%rsp), %rsi + leaq A(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + + // Power 2^4 - 1 = 15 + + leaq S(%rsp), %rdi + movq $2, %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + + // Power 2^5 - 1 = 31 + + leaq S(%rsp), %rdi + movq $1, %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq A(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + + // Power 2^10 - 1 + + leaq S(%rsp), %rdi + movq $5, %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + + // Power 2^20 - 1 + + leaq S(%rsp), %rdi + movq $10, %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + + // Power 2^25 - 1 + + leaq S(%rsp), %rdi + movq $5, %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + + // Power 2^50 - 1 + + leaq S(%rsp), %rdi + movq $25, %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + + // Power 2^100 - 1 + + leaq S(%rsp), %rdi + movq $50, %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + + // Power 2^125 - 1 + + leaq S(%rsp), %rdi + movq $25, %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + + // Power 2^250 - 1 + + leaq S(%rsp), %rdi + movq $125, %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + + // Power 2^252 - 3 + + leaq S(%rsp), %rdi + movq $2, %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_nsqr_p25519 + + leaq S(%rsp), %rdi + leaq S(%rsp), %rsi + leaq A(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + +// s = a^{2^252-3} is now one candidate inverse square root. 
+// Generate the other one t = s * j_25519 where j_25519 = sqrt(-1) + + movq $0xc4ee1b274a0ea0b0, %rax + movq %rax, T(%rsp) + movq $0x2f431806ad2fe478, %rax + movq %rax, T+8(%rsp) + movq $0x2b4d00993dfbd7a7, %rax + movq %rax, T+16(%rsp) + movq $0x2b8324804fc1df0b, %rax + movq %rax, T+24(%rsp) + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + +// Now multiplex between them according to whether a * s^2 = 1 + + leaq B(%rsp), %rdi + movq $1, %rsi + leaq S(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq A(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + + movq B(%rsp), %rax + xorq $1, %rax + movq B+8(%rsp), %rbx + orq %rbx, %rax + movq B+16(%rsp), %rcx + movq B+24(%rsp), %rdx + orq %rdx, %rcx + orq %rcx, %rax + + movq S(%rsp), %rax + movq T(%rsp), %rbx + cmovnzq %rbx, %rax + movq S+8(%rsp), %rbx + movq T+8(%rsp), %rcx + cmovnzq %rcx, %rbx + movq S+16(%rsp), %rcx + movq T+16(%rsp), %rdx + cmovnzq %rdx, %rcx + movq S+24(%rsp), %rbp + movq T+24(%rsp), %rdx + cmovnzq %rdx, %rbp + +// For definiteness, choose "positive" (LSB=0) inverse square root + + xorl %edx, %edx + leaq -19(%rdx), %r8 + leaq -1(%rdx), %r11 + movq %r11, %r9 + movq %r11, %r10 + btr $63, %r11 + + subq %rax, %r8 + sbbq %rbx, %r9 + sbbq %rcx, %r10 + sbbq %rbp, %r11 + + movq res, %rdx + testq $1, %rax + cmovnzq %r8, %rax + movq %rax, (%rdx) + cmovnzq %r9, %rbx + movq %rbx, 8(%rdx) + cmovnzq %r10, %rcx + movq %rcx, 16(%rdx) + cmovnzq %r11, %rbp + movq %rbp, 24(%rdx) + +// Determine if it is is indeed an inverse square root, also distinguishing +// the degenerate x * z^2 == 0 (mod p_25519) case, which is equivalent to +// x == 0 (mod p_25519). Hence return the Legendre-Jacobi symbol as required. 
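+//
+// The return value is then formed without branches: with b = a * z^2 as
+// computed below, %rax becomes the OR of the digits of b XOR 1 (zero exactly
+// when b == 1) and %rbp the OR of the digits of b (zero exactly when b == 0).
+// The neg/sbb pair turns %rax into a 0 or -1 mask, the lea maps that to
+// +1 or -1, and the final cmovz overrides the result with 0 when b == 0.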
+ + leaq B(%rsp), %rdi + movq $1, %rsi + callq bignum_invsqrt_p25519_alt_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq A(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + + movq $1, %rax + movq B(%rsp), %rbp + xorq %rbp, %rax + movq B+8(%rsp), %rbx + orq %rbx, %rax + orq %rbx, %rbp + movq B+16(%rsp), %rcx + movq B+24(%rsp), %rdx + orq %rdx, %rcx + orq %rcx, %rax + orq %rcx, %rbp + + negq %rax + sbbq %rax, %rax + leaq 1(%rax,%rax,1), %rax + + testq %rbp, %rbp + cmovzq %rbp, %rax + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +// ************************************************************* +// Local z = x * y +// ************************************************************* + +bignum_invsqrt_p25519_alt_mul_p25519: + movq %rdx, %rcx + movq (%rsi), %rax + mulq (%rcx) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq (%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x8(%rsi), %rax + mulq (%rcx) + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x0, %r11 + xorq %r12, %r12 + movq (%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x8(%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x10(%rsi), %rax + mulq (%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq (%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x8(%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x10(%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x18(%rsi), %rax + mulq (%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x8(%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x18(%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x10(%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x18(%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r14 + adcq %rdx, %r15 + movl $0x26, %esi + movq %r12, %rax + mulq %rsi + addq %rax, %r8 + adcq %rdx, %r9 + sbbq %rcx, %rcx + movq %r13, %rax + mulq %rsi + subq %rcx, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r14, %rax + mulq %rsi + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq %r15, %rax + mulq %rsi + subq %rcx, %rdx + xorq %rcx, %rcx + addq %rax, %r11 + movq %rdx, %r12 + adcq %rcx, %r12 + shldq $0x1, %r11, %r12 + leaq 0x1(%r12), %rax + movl $0x13, %esi + bts $0x3f, %r11 + imulq %rsi, %rax + addq %rax, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rcx, %r11 + sbbq %rax, %rax + notq %rax + andq %rsi, %rax + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rcx, %r10 + sbbq %rcx, %r11 + btr $0x3f, %r11 + movq %r8, (%rdi) + movq %r9, 0x8(%rdi) + movq %r10, 0x10(%rdi) + movq %r11, 0x18(%rdi) + ret + +// ************************************************************* +// Local z = 2^n * x +// ************************************************************* + +bignum_invsqrt_p25519_alt_nsqr_p25519: + +// Copy input argument into u + + movq (%rdx), %rax + movq 8(%rdx), %rbx + movq 16(%rdx), %rcx + movq 24(%rdx), %rdx + 
movq %rax, U8(%rsp) + movq %rbx, U8+8(%rsp) + movq %rcx, U8+16(%rsp) + movq %rdx, U8+24(%rsp) + +// Main squaring loop, accumulating in u consistently and +// only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38 + +bignum_invsqrt_p25519_alt_loop: + movq U8(%rsp), %rax + mulq %rax + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq U8(%rsp), %rax + mulq U8+0x8(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x0, %r11 + xorq %r12, %r12 + movq U8+0x8(%rsp), %rax + mulq %rax + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq U8(%rsp), %rax + mulq U8+0x10(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq U8(%rsp), %rax + mulq U8+0x18(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq U8+0x8(%rsp), %rax + mulq U8+0x10(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq U8+0x8(%rsp), %rax + mulq U8+0x18(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq U8+0x10(%rsp), %rax + mulq %rax + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq U8+0x10(%rsp), %rax + mulq U8+0x18(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq U8+0x18(%rsp), %rax + mulq %rax + addq %rax, %r14 + adcq %rdx, %r15 + movl $0x26, %ebx + movq %r12, %rax + mulq %rbx + addq %rax, %r8 + adcq %rdx, %r9 + sbbq %rcx, %rcx + movq %r13, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r14, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq %r15, %rax + mulq %rbx + subq %rcx, %rdx + xorq %rcx, %rcx + addq %rax, %r11 + movq %rdx, %r12 + adcq %rcx, %r12 + shldq $0x1, %r11, %r12 + btr $0x3f, %r11 + movl $0x13, %edx + imulq %r12, %rdx + addq %rdx, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rcx, %r11 + movq %r8, U8(%rsp) + movq %r9, U8+0x8(%rsp) + movq %r10, U8+0x10(%rsp) + movq %r11, U8+0x18(%rsp) + +// Loop as applicable + + decq %rsi + jnz bignum_invsqrt_p25519_alt_loop + +// We know the intermediate result x < 2^256 - 38, and now we do strict +// modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255 +// which is equivalent to a "ns" condition. We just use the results where +// they were in registers [%r11;%r10;%r9;%r8] instead of re-loading them. 
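The strict-reduction test described in the comment above relies only on the invariant x < 2^256 - 38 = 2 * p_25519: adding 19 and inspecting bit 255 of the sum decides whether a subtraction of p_25519 is needed. A small Python sketch (illustrative only):

    P = 2**255 - 19

    def strict_reduce(x):
        # Assumes only the stated invariant x < 2^256 - 38 = 2 * p_25519
        assert 0 <= x < 2 * P
        s = x + 19
        if s >> 255 == 0:            # "ns" case: x + 19 < 2^255, so x < p_25519
            return x
        return s & (2**255 - 1)      # keep x + 19 and clear bit 255 (= x - p_25519)

    for x in (0, P - 1, P, 2 * P - 1):
        assert strict_reduce(x) == x % P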
+ + movl $19, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + + cmovns %r8, %rax + cmovns %r9, %rbx + cmovns %r10, %rcx + cmovns %r11, %rdx + btr $63, %rdx + movq %rax, (%rdi) + movq %rbx, 8(%rdi) + movq %rcx, 16(%rdi) + movq %rdx, 24(%rdi) + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/curve25519/bignum_madd_n25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_madd_n25519.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/bignum_madd_n25519.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_madd_n25519.S index 5ec8de2de23..7d5282521fd 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/bignum_madd_n25519.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_madd_n25519.S @@ -95,6 +95,7 @@ adcq %rbx, m3 S2N_BN_SYMBOL(bignum_madd_n25519): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/curve25519/bignum_madd_n25519_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_madd_n25519_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/bignum_madd_n25519_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_madd_n25519_alt.S index f264b79c29e..5abdd1377f3 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/bignum_madd_n25519_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_madd_n25519_alt.S @@ -95,6 +95,7 @@ adcq %rbx, m3 S2N_BN_SYMBOL(bignum_madd_n25519_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mod_m25519_4.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mod_m25519_4.S new file mode 100644 index 00000000000..72aa689aa72 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mod_m25519_4.S @@ -0,0 +1,96 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod m_25519 +// Input x[4]; output z[4] +// +// extern void bignum_mod_m25519_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Reduction is modulo the group order of curve25519/edwards25519. +// This is the full group order, 8 * the standard basepoint order. 
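Because any 4-word input satisfies x < 2^256 < 2 * m_25519, a single conditional subtraction performs the whole reduction. A Python reference model (an illustrative sketch, with the group order written out as an integer rather than as the limb constants used below):

    N_25519 = 2**252 + 27742317777372353535851937790883648493   # basepoint order
    M_25519 = 8 * N_25519                                        # full group order

    def mod_m25519_4(x):
        assert 0 <= x < 2**256        # any 4-limb value; note 2^256 < 2 * m_25519
        d = x - M_25519
        return x if d < 0 else d      # add m_25519 back iff the subtraction borrowed

    assert mod_m25519_4(M_25519 + 5) == 5
    assert mod_m25519_4(7) == 7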
+// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_m25519_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_m25519_4) + .text + +#define z %rdi +#define x %rsi + +#define d0 %rdx +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n0 %rax +#define n1 %r10 +#define n3 %r11 + +// Can re-use this as a temporary once we've loaded the input + +#define c %rsi + +S2N_BN_SYMBOL(bignum_mod_m25519_4): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load a set of registers [n3; 0; n1; n0] = m_25519 + + movq $0xc09318d2e7ae9f68, n0 + movq $0xa6f7cef517bce6b2, n1 + movq $0x8000000000000000, n3 + +// Load the input and compute x - m_25519 + + movq (x), d0 + subq n0, d0 + movq 8(x), d1 + sbbq n1, d1 + movq 16(x), d2 + sbbq $0, d2 + movq 24(x), d3 + sbbq n3, d3 + +// Now CF is set iff x < m_25519. Create a mask for that condition and mask +// the three nontrivial digits ready to undo the previous subtraction with +// a compensating addition + + sbbq c, c + andq c, n0 + andq c, n1 + andq c, n3 + +// Now add mask * m_25519 again and store + + addq n0, d0 + movq d0, (z) + adcq n1, d1 + movq d1, 8(z) + adcq $0, d2 + movq d2, 16(z) + adcq n3, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/curve25519/bignum_mod_n25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mod_n25519.S similarity index 93% rename from third_party/s2n-bignum/x86_att/curve25519/bignum_mod_n25519.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mod_n25519.S index 52c45899543..7d402e66919 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/bignum_mod_n25519.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mod_n25519.S @@ -35,6 +35,7 @@ #define q %rbx S2N_BN_SYMBOL(bignum_mod_n25519): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi @@ -53,7 +54,7 @@ S2N_BN_SYMBOL(bignum_mod_n25519): // If the input is already <= 3 words long, go to a trivial "copy" path cmpq $4, k - jc shortinput + jc bignum_mod_n25519_shortinput // Otherwise load the top 4 digits (top-down) and reduce k by 4 // This [m3;m2;m1;m0] is the initial x where we begin reduction. @@ -119,7 +120,7 @@ S2N_BN_SYMBOL(bignum_mod_n25519): // estimation process. 
testq k, k - jz writeback + jz bignum_mod_n25519_writeback bignum_mod_n25519_loop: @@ -187,7 +188,7 @@ bignum_mod_n25519_loop: // Write back -writeback: +bignum_mod_n25519_writeback: movq m0, (z) movq m1, 8(z) @@ -205,7 +206,7 @@ writeback: #endif ret -shortinput: +bignum_mod_n25519_shortinput: xorq m0, m0 xorq m1, m1 @@ -213,15 +214,15 @@ shortinput: xorq m3, m3 testq k, k - jz writeback + jz bignum_mod_n25519_writeback movq (%rdx), m0 decq k - jz writeback + jz bignum_mod_n25519_writeback movq 8(%rdx), m1 decq k - jz writeback + jz bignum_mod_n25519_writeback movq 16(%rdx), m2 - jmp writeback + jmp bignum_mod_n25519_writeback #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mod_n25519_4.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mod_n25519_4.S new file mode 100644 index 00000000000..11ca27133a3 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mod_n25519_4.S @@ -0,0 +1,109 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo basepoint order, z := x mod n_25519 +// Input x[4]; output z[4] +// +// extern void bignum_mod_n25519_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Reduction is modulo the order of the curve25519/edwards25519 basepoint. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n25519_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n25519_4) + .text + +#define z %rdi +#define x %rsi + +#define q %rcx + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +S2N_BN_SYMBOL(bignum_mod_n25519_4): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load the top digit first, get the quotient estimate q = floor(x/2^252) +// and delete it from that digit, in effect doing x' = x - q * 2^252. +// Now we only need x' - q * n_25519' where n_25519' = n_25519 - 2^252 + + movq 24(x), q + movq q, d3 + shrq $60, q + shlq $4, d3 + shrq $4, d3 + +// Compute [%rdx;d2;d1] = q * n_25519' + + movq $0x5812631a5cf5d3ed, %rax + mulq q + movq %rax, d1 + movq %rdx, d2 + + movq $0x14def9dea2f79cd6, %rax + mulq q + addq %rax, d2 + adcq $0, %rdx + +// Subtract to get [d3;d2;d1;d0] = x - q * n_25519 + + movq (x), d0 + subq d1, d0 + movq 8(x), d1 + sbbq d2, d1 + movq 16(x), d2 + sbbq %rdx, d2 + sbbq $0, d3 + +// Get a bitmask q for the borrow and create masked version of +// non-trivial digits of [%rcx;0;%rdx;%rax] = n_25519. Note that +// %rcx = q but we can get it from the corresponding bit of %rax. 
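The quotient estimate q = floor(x / 2^252) used here is exact or one too large for any 4-word x, so a single borrow-triggered compensating addition of n_25519 completes the reduction. A compact Python model (illustrative sketch only):

    N_25519 = 2**252 + 27742317777372353535851937790883648493

    def mod_n25519_4(x):
        assert 0 <= x < 2**256
        q = x >> 252                  # quotient estimate, exact or one too large
        r = x - q * N_25519           # may dip below zero by less than n_25519
        return r + N_25519 if r < 0 else r

    for x in (0, N_25519 - 1, N_25519, 2**256 - 1, 15 * N_25519 + 7):
        assert mod_n25519_4(x) == x % N_25519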
+ + sbbq q, q + + movq $0x5812631a5cf5d3ed, %rax + andq q, %rax + movq $0x14def9dea2f79cd6, %rdx + andq q, %rdx + movq $0x1000000000000000, %rcx + andq %rax, %rcx + +// Do compensating addition (iff subtraction borrowed) and store + + addq %rax, d0 + movq d0, (z) + adcq %rdx, d1 + movq d1, 8(z) + adcq $0, d2 + movq d2, 16(z) + adcq %rcx, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi + +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mod_p25519_4.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mod_p25519_4.S new file mode 100644 index 00000000000..2618031c6de --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mod_p25519_4.S @@ -0,0 +1,97 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo field characteristic, z := x mod p_25519 +// Input x[4]; output z[4] +// +// extern void bignum_mod_p25519_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_p25519_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_p25519_4) + .text + +#define z %rdi +#define x %rsi + +#define d0 %rdx +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 +#define c %r10 + +#define q %rax + +#define qshort %eax +#define cshort %r10d + +S2N_BN_SYMBOL(bignum_mod_p25519_4): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load the inputs as [d3;d2;d1;d0] + + movq (x), d0 + movq 8(x), d1 + movq 16(x), d2 + movq 24(x), d3 + +// Letting x = 2^255 * h + l where h is the top bit, the provisional quotient +// is q = h + 1, which is either correct or 1 too high. + + movl $1, qshort + xorl cshort, cshort + bts $63, d3 + adcq c, q + imulq $19, q + +// Writing the provisional remainder as r = x - (2^255 - 19) * q we +// compute r' = (2^255 + l) + 19 * q = r + 2^256 + + addq q, d0 + adcq c, d1 + adcq c, d2 + adcq c, d3 + +// Now r < 0 <=> r' < 2^256 <=> ~CF and in this case we correct by adding +// 2^255 - 19, or in fact subtracting 19 and masking to 255 bits. + + movl $19, qshort + cmovcq c, q + + subq q, d0 + sbbq c, d1 + sbbq c, d2 + sbbq c, d3 + btr $63, d3 + +// Store the end result + + movq d0, (z) + movq d1, 8(z) + movq d2, 16(z) + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mul_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mul_p25519.S new file mode 100644 index 00000000000..9ff1d5ac3dd --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mul_p25519.S @@ -0,0 +1,202 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply modulo p_25519, z := (x * y) mod p_25519 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_mul_p25519 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p25519) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// Copied in or set up + +#define y %rcx + +// A zero register + +#define zero %rbp +#define zeroe %ebp + +// mulpadd(high,low,m) adds %rdx * m to a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rbx as temporaries. + +#define mulpadd(high,low,m) \ + mulxq m, %rax, %rbx ; \ + adcxq %rax, low ; \ + adoxq %rbx, high + +// mulpade(high,low,m) adds %rdx * m to a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax as a temporary, assuming high created from scratch +// and that zero has value zero. + +#define mulpade(high,low,m) \ + mulxq m, %rax, high ; \ + adcxq %rax, low ; \ + adoxq zero, high + +S2N_BN_SYMBOL(bignum_mul_p25519): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Copy y into a safe register to start with + + movq %rdx, y + +// Zero a register, which also makes sure we don't get a fake carry-in + + xorl zeroe, zeroe + +// Do the zeroth row, which is a bit different + + movq (y), %rdx + + mulxq (x), %r8, %r9 + mulxq 8(x), %rax, %r10 + addq %rax, %r9 + mulxq 16(x), %rax, %r11 + adcq %rax, %r10 + mulxq 24(x), %rax, %r12 + adcq %rax, %r11 + adcq zero, %r12 + +// Add row 1 + + xorl zeroe, zeroe + movq 8(y), %rdx + mulpadd(%r10,%r9,(x)) + mulpadd(%r11,%r10,8(x)) + mulpadd(%r12,%r11,16(x)) + mulpade(%r13,%r12,24(x)) + adcq zero, %r13 + +// Add row 2 + + xorl zeroe, zeroe + movq 16(y), %rdx + mulpadd(%r11,%r10,(x)) + mulpadd(%r12,%r11,8(x)) + mulpadd(%r13,%r12,16(x)) + mulpade(%r14,%r13,24(x)); + adcq zero, %r14 + +// Add row 3; also use an early 38*r15+r11 to get a quotient estimate q +// and then squeeze in a 19 * q computation to inject into the next +// double-carry chain. At the end %rcx = q and %rax = 19 * q. + + xorl zeroe, zeroe + movq 24(y), %rdx + + mulpadd(%r12,%r11,(x)) + + mulxq 24(x), %rcx, %r15 + + mulpadd(%r13,%r12,8(x)) + mulpadd(%r14,%r13,16(x)) + + movl $38, %edx + mulxq %r15, %rax, %rbx + + adcxq %rcx, %r14 + adoxq zero, %r15 + adcq zero, %r15 + + addq %r11, %rax + adcq zero, %rbx + btq $63, %rax + adcq %rbx, %rbx + leaq 1(%rbx), %rcx + imulq $19, %rcx + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [%r15,%r14,%r13,%r12] and l = [%r11,%r10,%r9,%r8] +// and this is == 38 * h + l (mod p_25519) +// We add in the precalculated 19 * q as well. +// This is kept in 4 words since we have enough information there. 
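The reduction rests on 2^256 == 38 and 2^255 == 19 (mod p_25519): the double-width product folds to 38*h + l, and the quotient estimate taken from the top of that folded value is exact or one too big. A Python model of the arithmetic, without the limb-level carry scheduling (illustrative sketch only):

    P = 2**255 - 19

    def mul_p25519_ref(x, y):
        prod = x * y                     # full double-width product
        h, l = prod >> 256, prod & (2**256 - 1)
        v = 38 * h + l                   # fold: 2^256 * h + l == 38*h + l (mod p)
        q = (v >> 255) + 1               # quotient estimate: exact or one too big
        r = v - q * P                    # lies in [-p, p) for any 256-bit x, y
        return r + P if r < 0 else r

    import random
    for _ in range(1000):
        x, y = random.getrandbits(256), random.getrandbits(256)
        assert mul_p25519_ref(x, y) == (x * y) % P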
+ + xorl zeroe, zeroe + adoxq %rcx, %r8 + mulpadd(%r9,%r8,%r12) + mulpadd(%r10,%r9,%r13) + mulpadd(%r11,%r10,%r14) + mulxq %r15, %rax, %rbx + adcq %rax, %r11 + +// We still haven't made the -2^255 * q contribution yet. Since we +// are now safely in 4 words we just need a single bit of q, and we +// can actually use the LSB of %rcx = 19 * q since 19 is odd. And we +// don't literally need to subtract, just to see whether we would +// have a top 1 bit if we did, meaning we need to correct in the +// last step by adding 2^255 - 19. + + shlq $63, %rcx + cmpq %rcx, %r11 + movl $19, %eax + cmovns zero, %rax + +// Now make that possible correction and finally mask to 255 bits + + subq %rax, %r8 + sbbq zero, %r9 + sbbq zero, %r10 + sbbq zero, %r11 + btr $63, %r11 + +// Write everything back + + movq %r8, (z) + movq %r9, 8(z) + movq %r10, 16(z) + movq %r11, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mul_p25519_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mul_p25519_alt.S new file mode 100644 index 00000000000..d339b0196d7 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mul_p25519_alt.S @@ -0,0 +1,217 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply modulo p_25519, z := (x * y) mod p_25519 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_mul_p25519_alt +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p25519_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p25519_alt) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// Copied in or set up + +#define y %rcx + +// Re-use input pointers later for constant and top carry + +#define d %rsi +#define c %rcx + +#define dshort %esi + +// Macro for the key "multiply and add to (c,h,l)" step + +#define combadd(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// A minutely shorter form for when c = 0 initially + +#define combadz(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq c, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h + +S2N_BN_SYMBOL(bignum_mul_p25519_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Copy y into a safe register to start with + + movq %rdx, y + +// Start the window as [%r10;%r9;%r8] with 00 product + + movq (x), %rax + mulq (y) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + +// Column 1 + + xorq %r11, %r11 + combads(%r10,%r9,(x),8(y)) + combadd(%r11,%r10,%r9,8(x),(y)) 
+ +// Column 2 + + xorq %r12, %r12 + combadz(%r12,%r11,%r10,(x),16(y)) + combadd(%r12,%r11,%r10,8(x),8(y)) + combadd(%r12,%r11,%r10,16(x),(y)) + +// Column 3 + + xorq %r13, %r13 + combadz(%r13,%r12,%r11,(x),24(y)) + combadd(%r13,%r12,%r11,8(x),16(y)) + combadd(%r13,%r12,%r11,16(x),8(y)) + combadd(%r13,%r12,%r11,24(x),(y)) + +// Column 4 + + xorq %r14, %r14 + combadz(%r14,%r13,%r12,8(x),24(y)) + combadd(%r14,%r13,%r12,16(x),16(y)) + combadd(%r14,%r13,%r12,24(x),8(y)) + +// Column 5 + + xorq %r15, %r15 + combadz(%r15,%r14,%r13,16(x),24(y)) + combadd(%r15,%r14,%r13,24(x),16(y)) + +// Final work for columns 6 and 7 + + movq 24(x), %rax + mulq 24(y) + addq %rax, %r14 + adcq %rdx, %r15 + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [%r15,%r14,%r13,%r12] and l = [%r11,%r10,%r9,%r8] +// and this is == 38 * h + l (mod p_25519) + + movl $38, dshort + + movq %r12, %rax + mulq d + addq %rax, %r8 + adcq %rdx, %r9 + sbbq c, c + + movq %r13, %rax + mulq d + subq c, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq c, c + + movq %r14, %rax + mulq d + subq c, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq c, c + + movq %r15, %rax + mulq d + subq c, %rdx + xorq c, c + addq %rax, %r11 + movq %rdx, %r12 + adcq c, %r12 + +// Now we have reduced to 5 digits, 2^255 * h + l = [%r12,%r11,%r10,%r9,%r8] +// Use q = h + 1 as the initial quotient estimate, either right or 1 too big. + + shldq $1, %r11, %r12 + leaq 1(%r12), %rax + movl $19, dshort + bts $63, %r11 + imulq d, %rax + addq %rax, %r8 + adcq c, %r9 + adcq c, %r10 + adcq c, %r11 + +// Now the effective answer is 2^256 * (CF - 1) + [u3,u2,u1,u0] +// So we correct if CF = 0 by subtracting 19, either way masking to +// 255 bits, i.e. by effectively adding p_25519 to the "full" answer + + sbbq %rax, %rax + notq %rax + andq d, %rax + subq %rax, %r8 + sbbq c, %r9 + sbbq c, %r10 + sbbq c, %r11 + btr $63, %r11 + +// Write everything back + + movq %r8, (z) + movq %r9, 8(z) + movq %r10, 16(z) + movq %r11, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/curve25519/bignum_neg_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_neg_p25519.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/bignum_neg_p25519.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_neg_p25519.S index 5e66073baf6..7b9408f0e8f 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/bignum_neg_p25519.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_neg_p25519.S @@ -33,6 +33,7 @@ #define qshort %esi S2N_BN_SYMBOL(bignum_neg_p25519): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_optneg_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_optneg_p25519.S new file mode 100644 index 00000000000..61ff47cd297 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_optneg_p25519.S @@ -0,0 +1,98 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally negate modulo p_25519, z := (-x) mod p_25519 (if p nonzero) or +// z := x (if p zero), assuming x reduced +// Inputs p, x[4]; output z[4] +// +// extern void bignum_optneg_p25519 +// (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = p, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = p, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optneg_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optneg_p25519) + .text + +#define z %rdi +#define q %rsi +#define x %rdx + +#define n0 %rax +#define n1 %rcx +#define n2 %r8 +#define n3 %r9 + +S2N_BN_SYMBOL(bignum_optneg_p25519): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Adjust q by zeroing it if the input is zero (to avoid giving -0 = p_25519, +// which is not strictly reduced even though it's correct modulo p_25519). + + movq (x), n0 + orq 8(x), n0 + movq 16(x), n1 + orq 24(x), n1 + orq n1, n0 + negq n0 + sbbq n0, n0 + andq n0, q + +// Turn q into a bitmask, all 1s for q=false, all 0s for q=true + + negq q + sbbq q, q + notq q + +// Let [n3;n2;n1;n0] = if q then p_25519 else -1 + + movq $0xffffffffffffffed, n0 + orq q, n0 + movq $0xffffffffffffffff, n1 + movq n1, n2 + movq $0x7fffffffffffffff, n3 + orq q, n3 + +// Subtract so [n3;n2;n1;n0] = if q then p_25519 - x else -1 - x + + subq (x), n0 + sbbq 8(x), n1 + sbbq 16(x), n2 + sbbq 24(x), n3 + +// XOR the words with the bitmask, which in the case q = false has the +// effect of restoring ~(-1 - x) = -(-1 - x) - 1 = 1 + x - 1 = x +// and write back the digits to the output + + xorq q, n0 + movq n0, (z) + xorq q, n1 + movq n1, 8(z) + xorq q, n2 + movq n2, 16(z) + xorq q, n3 + movq n3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sqr_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sqr_p25519.S new file mode 100644 index 00000000000..45a6890dd39 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sqr_p25519.S @@ -0,0 +1,186 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square modulo p_25519, z := (x^2) mod p_25519 +// Input x[4]; output z[4] +// +// extern void bignum_sqr_p25519 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p25519) + .text + +#define z %rdi +#define x %rsi + +// Use this fairly consistently for a zero + +#define zero %rbx +#define zeroe %ebx + +// Add %rdx * m into a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rcx as temporaries + +#define mulpadd(high,low,m) \ + mulxq m, %rax, %rcx ; \ + adcxq %rax, low ; \ + adoxq %rcx, high + +// mulpade(high,low,m) adds %rdx * m to a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax as a temporary, assuming high created from scratch +// and that zero has value zero. + +#define mulpade(high,low,m) \ + mulxq m, %rax, high ; \ + adcxq %rax, low ; \ + adoxq zero, high + +S2N_BN_SYMBOL(bignum_sqr_p25519): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Compute [%r15;%r8] = [00] which we use later, but mainly +// set up an initial window [%r14;...;%r9] = [23;03;01] + + movq (x), %rdx + mulxq %rdx, %r8, %r15 + mulxq 8(x), %r9, %r10 + mulxq 24(x), %r11, %r12 + movq 16(x), %rdx + mulxq 24(x), %r13, %r14 + +// Clear our zero register, and also initialize the flags for the carry chain + + xorl zeroe, zeroe + +// Chain in the addition of 02 + 12 + 13 to that window (no carry-out possible) +// This gives all the "heterogeneous" terms of the squaring ready to double + + mulpadd(%r11,%r10,(x)) + mulpadd(%r12,%r11,8(x)) + movq 24(x), %rdx + mulpadd(%r13,%r12,8(x)) + adcxq zero, %r13 + adoxq zero, %r14 + adcq zero, %r14 + +// Double and add to the 00 + 11 + 22 + 33 terms, while also +// pre-estimating the quotient from early results. + + xorl zeroe, zeroe + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 8(x), %rdx + mulxq %rdx, %rax, %rcx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rcx, %r11 + movq 16(x), %rdx + mulxq %rdx, %rax, %rcx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rcx, %r13 + movq 24(x), %rdx + mulxq %rdx, %rax, %r15 + + movl $38, %edx + mulxq %r15, %rdx, %rcx + + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq zero, %r15 + adoxq zero, %r15 + + addq %r11, %rdx + adcq zero, %rcx + shldq $1, %rdx, %rcx + leaq 1(%rcx), %rbx + imulq $19, %rbx + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [%r15,%r14,%r13,%r12] and l = [%r11,%r10,%r9,%r8] +// and this is == 38 * h + l (mod p_25519) +// We add in the precalculated 19 * q as well. +// This is kept in 4 words since we have enough information there. + + xorl %eax, %eax + adoxq %rbx, %r8 + movl $38, %edx + mulpadd(%r9,%r8,%r12) + mulpadd(%r10,%r9,%r13) + mulpadd(%r11,%r10,%r14) + mulxq %r15, %rax, %rcx + adcq %rax, %r11 + +// We still haven't made the -2^255 * q contribution yet. 
Since we +// are now safely in 4 words we just need a single bit of q, and we +// can actually use the LSB of %rcx = 19 * q since 19 is odd. And we +// don't literally need to subtract, just to see whether we would +// have a top 1 bit if we did, meaning we need to correct in the +// last step by adding 2^255 - 19. + + xorl %ecx, %ecx + shlq $63, %rbx + cmpq %rbx, %r11 + movl $19, %eax + cmovns %rcx, %rax + +// Now make that possible correction and finally mask to 255 bits + + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rcx, %r10 + sbbq %rcx, %r11 + btr $63, %r11 + +// Write everything back + + movq %r8, (z) + movq %r9, 8(z) + movq %r10, 16(z) + movq %r11, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sqr_p25519_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sqr_p25519_alt.S new file mode 100644 index 00000000000..17bef47a1a8 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sqr_p25519_alt.S @@ -0,0 +1,201 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square modulo p_25519, z := (x^2) mod p_25519 +// Input x[4]; output z[4] +// +// extern void bignum_sqr_p25519_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p25519_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p25519_alt) + .text + +#define z %rdi +#define x %rsi + +// Re-use input pointer later for constant + +#define d %rsi +#define c %rcx + +#define dshort %esi + +// Macro for the key "multiply and add to (c,h,l)" step, for square term + +#define combadd1(c,h,l,numa) \ + movq numa, %rax ; \ + mulq %rax; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa) \ + movq numa, %rax ; \ + mulq %rax; \ + addq %rax, l ; \ + adcq %rdx, h + +// A version doubling before adding, for non-square terms + +#define combadd2(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0, c ; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +S2N_BN_SYMBOL(bignum_sqr_p25519_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Result term 0 + + movq (x), %rax + mulq %rax + + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + +// Result term 1 + + xorq %r11, %r11 + combadd2(%r11,%r10,%r9,(x),8(x)) + +// Result term 2 + + xorq %r12, %r12 + combadd1(%r12,%r11,%r10,8(x)) + combadd2(%r12,%r11,%r10,(x),16(x)) + +// Result term 3 + + xorq %r13, %r13 + combadd2(%r13,%r12,%r11,(x),24(x)) + combadd2(%r13,%r12,%r11,8(x),16(x)) + +// Result term 4 + + xorq %r14, %r14 + combadd2(%r14,%r13,%r12,8(x),24(x)) + combadd1(%r14,%r13,%r12,16(x)) + +// Result term 5 + + xorq %r15, %r15 + combadd2(%r15,%r14,%r13,16(x),24(x)) + 
+// Result term 6 + + combads(%r15,%r14,24(x)) + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [%r15,%r14,%r13,%r12] and l = [%r11,%r10,%r9,%r8] +// and this is == 38 * h + l (mod p_25519) + + movl $38, dshort + + movq %r12, %rax + mulq d + addq %rax, %r8 + adcq %rdx, %r9 + sbbq c, c + + movq %r13, %rax + mulq d + subq c, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq c, c + + movq %r14, %rax + mulq d + subq c, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq c, c + + movq %r15, %rax + mulq d + subq c, %rdx + xorq c, c + addq %rax, %r11 + movq %rdx, %r12 + adcq c, %r12 + +// Now we have reduced to 5 digits, 2^255 * h + l = [%r12,%r11,%r10,%r9,%r8] +// Use q = h + 1 as the initial quotient estimate, either right or 1 too big. + + shldq $1, %r11, %r12 + leaq 1(%r12), %rax + movl $19, dshort + bts $63, %r11 + imulq d, %rax + addq %rax, %r8 + adcq c, %r9 + adcq c, %r10 + adcq c, %r11 + +// Now the effective answer is 2^256 * (CF - 1) + [u3,u2,u1,u0] +// So we correct if CF = 0 by subtracting 19, either way masking to +// 255 bits, i.e. by effectively adding p_25519 to the "full" answer + + sbbq %rax, %rax + notq %rax + andq d, %rax + subq %rax, %r8 + sbbq c, %r9 + sbbq c, %r10 + sbbq c, %r11 + btr $63, %r11 + +// Write everything back + + movq %r8, (z) + movq %r9, 8(z) + movq %r10, 16(z) + movq %r11, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sqrt_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sqrt_p25519.S new file mode 100644 index 00000000000..7762cf69a45 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sqrt_p25519.S @@ -0,0 +1,595 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square root modulo p_25519 = 2^255 - 19 +// Input x[4]; output function return (Legendre symbol) and z[4] +// +// extern int64_t bignum_sqrt_p25519(uint64_t z[static 4],uint64_t x[static 4]); +// +// Given a 4-digit input x, returns a modular square root mod p_25519, i.e. +// a z such that z^2 == x (mod p_25519), whenever one exists. The square +// root z is chosen so that its LSB is even (note that p_25519 - z is +// another square root). 
The function return is the Legendre/Jacobi symbol +// (x//p_25519), which indicates whether indeed x has a modular square root +// and hence whether the result is meaningful: +// +// 0: x is divisible by p_25519 and z is the square root 0 +// +1: x is coprime to p_25519 and z is a square root +// -1: x is coprime to p_25519 but not a quadratic residue +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqrt_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqrt_p25519) + .text + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define a 0(%rsp) +#define b (4*N)(%rsp) +#define s (8*N)(%rsp) +#define t (12*N)(%rsp) +#define u (16*N)(%rsp) +#define res (20*N)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (22*N) + +// Corrupted versions when stack is down 8 more + +#define u8 (17*N)(%rsp) + +// Syntactic variants to make x86_att version simpler to generate + +#define A 0 +#define B (4*N) +#define S (8*N) +#define T (12*N) +#define U (16*N) +#define U8 (17*N) + +S2N_BN_SYMBOL(bignum_sqrt_p25519): + _CET_ENDBR + +// In this case the Windows form literally makes a subroutine call. +// This avoids hassle arising from subroutine offsets + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + callq bignum_sqrt_p25519_standard + popq %rsi + popq %rdi + ret + +bignum_sqrt_p25519_standard: +#endif + +// Save registers and make room for temporaries + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Save the return pointer for the end so we can overwrite %rdi later + + movq %rdi, res + +// Set up reduced version of the input argument a = x mod p_25519. 
Then +// get the candidate square root s = a^{252-2} + + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 + movl $0x1, %eax + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, A(%rsp) + movq %rcx, A+0x8(%rsp) + movq %r8, A+0x10(%rsp) + movq %r9, A+0x18(%rsp) + + // Power 2^2 - 1 = 3 + + leaq T(%rsp), %rdi + movq $1, %rsi + leaq A(%rsp), %rdx + callq bignum_sqrt_p25519_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq T(%rsp), %rsi + leaq A(%rsp), %rdx + callq bignum_sqrt_p25519_mul_p25519 + + // Power 2^4 - 1 = 15 + + leaq S(%rsp), %rdi + movq $2, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_mul_p25519 + + // Power 2^5 - 1 = 31 + + leaq S(%rsp), %rdi + movq $1, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq A(%rsp), %rdx + callq bignum_sqrt_p25519_mul_p25519 + + // Power 2^10 - 1 + + leaq S(%rsp), %rdi + movq $5, %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_mul_p25519 + + // Power 2^20 - 1 + + leaq S(%rsp), %rdi + movq $10, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_mul_p25519 + + // Power 2^25 - 1 + + leaq S(%rsp), %rdi + movq $5, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_mul_p25519 + + // Power 2^50 - 1 + + leaq S(%rsp), %rdi + movq $25, %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_mul_p25519 + + // Power 2^100 - 1 + + leaq S(%rsp), %rdi + movq $50, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_mul_p25519 + + // Power 2^125 - 1 + + leaq S(%rsp), %rdi + movq $25, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_mul_p25519 + + // Power 2^250 - 1 + + leaq S(%rsp), %rdi + movq $125, %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_mul_p25519 + + // Power 2^251 - 1 + + leaq S(%rsp), %rdi + movq $1, %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq A(%rsp), %rdx + callq bignum_sqrt_p25519_mul_p25519 + + // Power 2^252 - 2 + + leaq S(%rsp), %rdi + movq $1, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_nsqr_p25519 + +// s is now one candidate square root. 
Generate the other one t = s * j_25519 + + movq $0xc4ee1b274a0ea0b0, %rax + movq %rax, T(%rsp) + movq $0x2f431806ad2fe478, %rax + movq %rax, T+8(%rsp) + movq $0x2b4d00993dfbd7a7, %rax + movq %rax, T+16(%rsp) + movq $0x2b8324804fc1df0b, %rax + movq %rax, T+24(%rsp) + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_mul_p25519 + +// Now multiplex between them according to whether s^2 = a + + leaq B(%rsp), %rdi + movq $1, %rsi + leaq S(%rsp), %rdx + callq bignum_sqrt_p25519_nsqr_p25519 + + movq A(%rsp), %rax + xorq B(%rsp), %rax + movq A+8(%rsp), %rbx + xorq B+8(%rsp), %rbx + orq %rbx, %rax + movq A+16(%rsp), %rcx + xorq B+16(%rsp), %rcx + movq A+24(%rsp), %rdx + xorq B+24(%rsp), %rdx + orq %rdx, %rcx + orq %rcx, %rax + + movq S(%rsp), %rax + movq T(%rsp), %rbx + cmovnzq %rbx, %rax + movq S+8(%rsp), %rbx + movq T+8(%rsp), %rcx + cmovnzq %rcx, %rbx + movq S+16(%rsp), %rcx + movq T+16(%rsp), %rdx + cmovnzq %rdx, %rcx + movq S+24(%rsp), %rbp + movq T+24(%rsp), %rdx + cmovnzq %rdx, %rbp + +// For definiteness, choose "positive" (LSB=0) square root + + xorl %edx, %edx + leaq -19(%rdx), %r8 + leaq -1(%rdx), %r11 + movq %r11, %r9 + movq %r11, %r10 + btr $63, %r11 + + subq %rax, %r8 + sbbq %rbx, %r9 + sbbq %rcx, %r10 + sbbq %rbp, %r11 + + movq res, %rdx + testq $1, %rax + cmovnzq %r8, %rax + movq %rax, (%rdx) + cmovnzq %r9, %rbx + movq %rbx, 8(%rdx) + cmovnzq %r10, %rcx + movq %rcx, 16(%rdx) + cmovnzq %r11, %rbp + movq %rbp, 24(%rdx) + +// Determine if it is is indeed a square root and also if a = 0 +// Hence return the Legendre-Jacobi symbol as required. + + leaq B(%rsp), %rdi + movq $1, %rsi + callq bignum_sqrt_p25519_nsqr_p25519 + + movq A(%rsp), %rax + movq %rax, %rbp + xorq B(%rsp), %rax + movq A+8(%rsp), %rbx + orq %rbx, %rbp + xorq B+8(%rsp), %rbx + orq %rbx, %rax + movq A+16(%rsp), %rcx + orq %rcx, %rbp + xorq B+16(%rsp), %rcx + movq A+24(%rsp), %rdx + orq %rdx, %rbp + xorq B+24(%rsp), %rdx + orq %rdx, %rcx + orq %rcx, %rax + negq %rax + sbbq %rax, %rax + leaq 1(%rax,%rax,1), %rax + + testq %rbp, %rbp + cmovzq %rbp, %rax + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +// ************************************************************* +// Local z = x * y +// ************************************************************* + +bignum_sqrt_p25519_mul_p25519: + movq %rdx, %rcx + xorl %ebp, %ebp + movq (%rcx), %rdx + mulxq (%rsi), %r8, %r9 + mulxq 0x8(%rsi), %rax, %r10 + addq %rax, %r9 + mulxq 0x10(%rsi), %rax, %r11 + adcq %rax, %r10 + mulxq 0x18(%rsi), %rax, %r12 + adcq %rax, %r11 + adcq %rbp, %r12 + xorl %ebp, %ebp + movq 0x8(%rcx), %rdx + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rax, %r13 + adcxq %rax, %r12 + adoxq %rbp, %r13 + adcq %rbp, %r13 + xorl %ebp, %ebp + movq 0x10(%rcx), %rdx + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x18(%rsi), %rax, %r14 + adcxq %rax, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + xorl %ebp, %ebp + movq 0x18(%rcx), %rdx + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rcx, %r15 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsi), 
%rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + movl $0x26, %edx + mulxq %r15, %rax, %rbx + adcxq %rcx, %r14 + adoxq %rbp, %r15 + adcq %rbp, %r15 + addq %r11, %rax + adcq %rbp, %rbx + btq $0x3f, %rax + adcq %rbx, %rbx + leaq 0x1(%rbx), %rcx + imulq $0x13, %rcx, %rcx + xorl %ebp, %ebp + adoxq %rcx, %r8 + mulxq %r12, %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq %r13, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r14, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq %r15, %rax, %rbx + adcq %rax, %r11 + shlq $0x3f, %rcx + cmpq %rcx, %r11 + movl $0x13, %eax + cmovns %rbp, %rax + subq %rax, %r8 + sbbq %rbp, %r9 + sbbq %rbp, %r10 + sbbq %rbp, %r11 + btr $0x3f, %r11 + movq %r8, (%rdi) + movq %r9, 0x8(%rdi) + movq %r10, 0x10(%rdi) + movq %r11, 0x18(%rdi) + ret + +// ************************************************************* +// Local z = 2^n * x +// ************************************************************* + +bignum_sqrt_p25519_nsqr_p25519: + +// Copy input argument into u + + movq (%rdx), %rax + movq 8(%rdx), %rbx + movq 16(%rdx), %rcx + movq 24(%rdx), %rdx + movq %rax, U8(%rsp) + movq %rbx, U8+8(%rsp) + movq %rcx, U8+16(%rsp) + movq %rdx, U8+24(%rsp) + +// Main squaring loop, accumulating in u consistently and +// only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38 + +bignum_sqrt_p25519_loop: + movq U8(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq U8+0x8(%rsp), %r9, %r10 + mulxq U8+0x18(%rsp), %r11, %r12 + movq U8+0x10(%rsp), %rdx + mulxq U8+0x18(%rsp), %r13, %r14 + xorl %ebx, %ebx + mulxq U8(%rsp), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq U8+0x8(%rsp), %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r12 + movq U8+0x18(%rsp), %rdx + mulxq U8+0x8(%rsp), %rax, %rcx + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rbx, %r13 + adoxq %rbx, %r14 + adcq %rbx, %r14 + xorl %ebx, %ebx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq U8+0x8(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq U8+0x10(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq U8+0x18(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rbx, %r15 + adoxq %rbx, %r15 + movl $0x26, %edx + xorl %ebx, %ebx + mulxq %r12, %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + mulxq %r13, %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq %r14, %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq %r15, %rax, %r12 + adcxq %rax, %r11 + adoxq %rbx, %r12 + adcxq %rbx, %r12 + shldq $0x1, %r11, %r12 + btr $0x3f, %r11 + movl $0x13, %edx + imulq %r12, %rdx + addq %rdx, %r8 + adcq %rbx, %r9 + adcq %rbx, %r10 + adcq %rbx, %r11 + movq %r8, U8(%rsp) + movq %r9, U8+0x8(%rsp) + movq %r10, U8+0x10(%rsp) + movq %r11, U8+0x18(%rsp) + +// Loop as applicable + + decq %rsi + jnz bignum_sqrt_p25519_loop + +// We know the intermediate result x < 2^256 - 38, and now we do strict +// modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255 +// which is equivalent to a "ns" condition. We just use the results where +// they were in registers [%r11;%r10;%r9;%r8] instead of re-loading them. 
+ + movl $19, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + + cmovns %r8, %rax + cmovns %r9, %rbx + cmovns %r10, %rcx + cmovns %r11, %rdx + btr $63, %rdx + movq %rax, (%rdi) + movq %rbx, 8(%rdi) + movq %rcx, 16(%rdi) + movq %rdx, 24(%rdi) + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sqrt_p25519_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sqrt_p25519_alt.S new file mode 100644 index 00000000000..6721bc8b283 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sqrt_p25519_alt.S @@ -0,0 +1,676 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square root modulo p_25519 = 2^255 - 19 +// Input x[4]; output function return (Legendre symbol) and z[4] +// +// extern int64_t bignum_sqrt_p25519_alt(uint64_t z[static 4],uint64_t x[static 4]); +// +// Given a 4-digit input x, returns a modular square root mod p_25519, i.e. +// a z such that z^2 == x (mod p_25519), whenever one exists. The square +// root z is chosen so that its LSB is even (note that p_25519 - z is +// another square root). The function return is the Legendre/Jacobi symbol +// (x//p_25519), which indicates whether indeed x has a modular square root +// and hence whether the result is meaningful: +// +// 0: x is divisible by p_25519 and z is the square root 0 +// +1: x is coprime to p_25519 and z is a square root +// -1: x is coprime to p_25519 but not a quadratic residue +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqrt_p25519_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqrt_p25519_alt) + .text + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define a 0(%rsp) +#define b (4*N)(%rsp) +#define s (8*N)(%rsp) +#define t (12*N)(%rsp) +#define u (16*N)(%rsp) +#define res (20*N)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (22*N) + +// Corrupted versions when stack is down 8 more + +#define u8 (17*N)(%rsp) + +// Syntactic variants to make x86_att version simpler to generate + +#define A 0 +#define B (4*N) +#define S (8*N) +#define T (12*N) +#define U (16*N) +#define U8 (17*N) + +S2N_BN_SYMBOL(bignum_sqrt_p25519_alt): + _CET_ENDBR + +// In this case the Windows form literally makes a subroutine call. +// This avoids hassle arising from subroutine offsets + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + callq bignum_sqrt_p25519_alt_standard + popq %rsi + popq %rdi + ret + +bignum_sqrt_p25519_alt_standard: +#endif + +// Save registers and make room for temporaries + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Save the return pointer for the end so we can overwrite %rdi later + + movq %rdi, res + +// Set up reduced version of the input argument a = x mod p_25519. 
Then +// get the candidate square root s = a^{252-2} + + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 + movl $0x1, %eax + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, A(%rsp) + movq %rcx, A+0x8(%rsp) + movq %r8, A+0x10(%rsp) + movq %r9, A+0x18(%rsp) + + // Power 2^2 - 1 = 3 + + leaq T(%rsp), %rdi + movq $1, %rsi + leaq A(%rsp), %rdx + callq bignum_sqrt_p25519_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq T(%rsp), %rsi + leaq A(%rsp), %rdx + callq bignum_sqrt_p25519_alt_mul_p25519 + + // Power 2^4 - 1 = 15 + + leaq S(%rsp), %rdi + movq $2, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_alt_mul_p25519 + + // Power 2^5 - 1 = 31 + + leaq S(%rsp), %rdi + movq $1, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_alt_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq A(%rsp), %rdx + callq bignum_sqrt_p25519_alt_mul_p25519 + + // Power 2^10 - 1 + + leaq S(%rsp), %rdi + movq $5, %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_alt_mul_p25519 + + // Power 2^20 - 1 + + leaq S(%rsp), %rdi + movq $10, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_alt_mul_p25519 + + // Power 2^25 - 1 + + leaq S(%rsp), %rdi + movq $5, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_alt_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_alt_mul_p25519 + + // Power 2^50 - 1 + + leaq S(%rsp), %rdi + movq $25, %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_alt_mul_p25519 + + // Power 2^100 - 1 + + leaq S(%rsp), %rdi + movq $50, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_alt_mul_p25519 + + // Power 2^125 - 1 + + leaq S(%rsp), %rdi + movq $25, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_alt_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_alt_mul_p25519 + + // Power 2^250 - 1 + + leaq S(%rsp), %rdi + movq $125, %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_alt_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_alt_mul_p25519 + + // Power 2^251 - 1 + + leaq S(%rsp), %rdi + movq $1, %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq A(%rsp), %rdx + callq bignum_sqrt_p25519_alt_mul_p25519 + + // Power 2^252 - 2 + + leaq S(%rsp), %rdi + movq $1, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_alt_nsqr_p25519 + +// s is now one candidate square root. 
Generate the other one t = s * j_25519 + + movq $0xc4ee1b274a0ea0b0, %rax + movq %rax, T(%rsp) + movq $0x2f431806ad2fe478, %rax + movq %rax, T+8(%rsp) + movq $0x2b4d00993dfbd7a7, %rax + movq %rax, T+16(%rsp) + movq $0x2b8324804fc1df0b, %rax + movq %rax, T+24(%rsp) + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_alt_mul_p25519 + +// Now multiplex between them according to whether s^2 = a + + leaq B(%rsp), %rdi + movq $1, %rsi + leaq S(%rsp), %rdx + callq bignum_sqrt_p25519_alt_nsqr_p25519 + + movq A(%rsp), %rax + xorq B(%rsp), %rax + movq A+8(%rsp), %rbx + xorq B+8(%rsp), %rbx + orq %rbx, %rax + movq A+16(%rsp), %rcx + xorq B+16(%rsp), %rcx + movq A+24(%rsp), %rdx + xorq B+24(%rsp), %rdx + orq %rdx, %rcx + orq %rcx, %rax + + movq S(%rsp), %rax + movq T(%rsp), %rbx + cmovnzq %rbx, %rax + movq S+8(%rsp), %rbx + movq T+8(%rsp), %rcx + cmovnzq %rcx, %rbx + movq S+16(%rsp), %rcx + movq T+16(%rsp), %rdx + cmovnzq %rdx, %rcx + movq S+24(%rsp), %rbp + movq T+24(%rsp), %rdx + cmovnzq %rdx, %rbp + +// For definiteness, choose "positive" (LSB=0) square root + + xorl %edx, %edx + leaq -19(%rdx), %r8 + leaq -1(%rdx), %r11 + movq %r11, %r9 + movq %r11, %r10 + btr $63, %r11 + + subq %rax, %r8 + sbbq %rbx, %r9 + sbbq %rcx, %r10 + sbbq %rbp, %r11 + + movq res, %rdx + testq $1, %rax + cmovnzq %r8, %rax + movq %rax, (%rdx) + cmovnzq %r9, %rbx + movq %rbx, 8(%rdx) + cmovnzq %r10, %rcx + movq %rcx, 16(%rdx) + cmovnzq %r11, %rbp + movq %rbp, 24(%rdx) + +// Determine if it is is indeed a square root and also if a = 0 +// Hence return the Legendre-Jacobi symbol as required. + + leaq B(%rsp), %rdi + movq $1, %rsi + callq bignum_sqrt_p25519_alt_nsqr_p25519 + + movq A(%rsp), %rax + movq %rax, %rbp + xorq B(%rsp), %rax + movq A+8(%rsp), %rbx + orq %rbx, %rbp + xorq B+8(%rsp), %rbx + orq %rbx, %rax + movq A+16(%rsp), %rcx + orq %rcx, %rbp + xorq B+16(%rsp), %rcx + movq A+24(%rsp), %rdx + orq %rdx, %rbp + xorq B+24(%rsp), %rdx + orq %rdx, %rcx + orq %rcx, %rax + negq %rax + sbbq %rax, %rax + leaq 1(%rax,%rax,1), %rax + + testq %rbp, %rbp + cmovzq %rbp, %rax + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +// ************************************************************* +// Local z = x * y +// ************************************************************* + +bignum_sqrt_p25519_alt_mul_p25519: + movq %rdx, %rcx + movq (%rsi), %rax + mulq (%rcx) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq (%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x8(%rsi), %rax + mulq (%rcx) + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x0, %r11 + xorq %r12, %r12 + movq (%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x8(%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x10(%rsi), %rax + mulq (%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq (%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x8(%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x10(%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x18(%rsi), %rax + mulq (%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x8(%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rsi), %rax + mulq 
0x10(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x18(%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x10(%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x18(%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r14 + adcq %rdx, %r15 + movl $0x26, %esi + movq %r12, %rax + mulq %rsi + addq %rax, %r8 + adcq %rdx, %r9 + sbbq %rcx, %rcx + movq %r13, %rax + mulq %rsi + subq %rcx, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r14, %rax + mulq %rsi + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq %r15, %rax + mulq %rsi + subq %rcx, %rdx + xorq %rcx, %rcx + addq %rax, %r11 + movq %rdx, %r12 + adcq %rcx, %r12 + shldq $0x1, %r11, %r12 + leaq 0x1(%r12), %rax + movl $0x13, %esi + bts $0x3f, %r11 + imulq %rsi, %rax + addq %rax, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rcx, %r11 + sbbq %rax, %rax + notq %rax + andq %rsi, %rax + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rcx, %r10 + sbbq %rcx, %r11 + btr $0x3f, %r11 + movq %r8, (%rdi) + movq %r9, 0x8(%rdi) + movq %r10, 0x10(%rdi) + movq %r11, 0x18(%rdi) + ret + +// ************************************************************* +// Local z = 2^n * x +// ************************************************************* + +bignum_sqrt_p25519_alt_nsqr_p25519: + +// Copy input argument into u + + movq (%rdx), %rax + movq 8(%rdx), %rbx + movq 16(%rdx), %rcx + movq 24(%rdx), %rdx + movq %rax, U8(%rsp) + movq %rbx, U8+8(%rsp) + movq %rcx, U8+16(%rsp) + movq %rdx, U8+24(%rsp) + +// Main squaring loop, accumulating in u consistently and +// only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38 + +bignum_sqrt_p25519_alt_loop: + movq U8(%rsp), %rax + mulq %rax + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq U8(%rsp), %rax + mulq U8+0x8(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x0, %r11 + xorq %r12, %r12 + movq U8+0x8(%rsp), %rax + mulq %rax + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq U8(%rsp), %rax + mulq U8+0x10(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq U8(%rsp), %rax + mulq U8+0x18(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq U8+0x8(%rsp), %rax + mulq U8+0x10(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq U8+0x8(%rsp), %rax + mulq U8+0x18(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq U8+0x10(%rsp), %rax + mulq %rax + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq U8+0x10(%rsp), %rax + mulq U8+0x18(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq U8+0x18(%rsp), %rax + mulq %rax + addq %rax, %r14 + adcq %rdx, %r15 + movl $0x26, %ebx + movq %r12, %rax + mulq %rbx + addq %rax, %r8 + adcq %rdx, %r9 + sbbq %rcx, %rcx + movq %r13, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r14, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq %r15, %rax + mulq %rbx + subq %rcx, %rdx + 
xorq %rcx, %rcx + addq %rax, %r11 + movq %rdx, %r12 + adcq %rcx, %r12 + shldq $0x1, %r11, %r12 + btr $0x3f, %r11 + movl $0x13, %edx + imulq %r12, %rdx + addq %rdx, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rcx, %r11 + movq %r8, U8(%rsp) + movq %r9, U8+0x8(%rsp) + movq %r10, U8+0x10(%rsp) + movq %r11, U8+0x18(%rsp) + +// Loop as applicable + + decq %rsi + jnz bignum_sqrt_p25519_alt_loop + +// We know the intermediate result x < 2^256 - 38, and now we do strict +// modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255 +// which is equivalent to a "ns" condition. We just use the results where +// they were in registers [%r11;%r10;%r9;%r8] instead of re-loading them. + + movl $19, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + + cmovns %r8, %rax + cmovns %r9, %rbx + cmovns %r10, %rcx + cmovns %r11, %rdx + btr $63, %rdx + movq %rax, (%rdi) + movq %rbx, 8(%rdi) + movq %rcx, 16(%rdi) + movq %rdx, 24(%rdi) + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sub_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sub_p25519.S new file mode 100644 index 00000000000..ecfd00a930d --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sub_p25519.S @@ -0,0 +1,85 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Subtract modulo p_25519, z := (x - y) mod p_25519 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_sub_p25519 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub_p25519) + .text + +#define z %rdi +#define x %rsi +#define y %rdx + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +#define zero %rax +#define zeroe %eax +#define c %rcx +#define cshort %ecx + +S2N_BN_SYMBOL(bignum_sub_p25519): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Load and subtract the two inputs as [d3;d2;d1;d0] = x - y (modulo 2^256) + + movq (x), d0 + subq (y), d0 + movq 8(x), d1 + sbbq 8(y), d1 + movq 16(x), d2 + sbbq 16(y), d2 + movq 24(x), d3 + sbbq 24(y), d3 + +// Now if x < y we want to add back p_25519, which staying within 4 digits +// means subtracting 19, since p_25519 = 2^255 - 19. +// Let c be that constant 19 when x < y, zero otherwise. 
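+//
+// Illustrative note (not part of the upstream comment): since
+// 2^256 - p_25519 = 2^255 + 19, the wrapped difference x - y + 2^256 is
+// turned into x - y + p_25519 by subtracting 19 and then clearing bit 255,
+// which is what the code below does; when x >= y, c = 0 and bit 255 is
+// already clear, so neither step changes the value. In pseudocode:
+//
+//   c = 19 if borrow else 0
+//   z = (((x - y) mod 2**256) - c) & (2**255 - 1)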
+ + sbbq c, c + xorl zeroe, zeroe + andq $19, c + +// Correct by adding the optional constant and masking to 255 bits + + subq c, d0 + movq d0, (z) + sbbq zero, d1 + movq d1, 8(z) + sbbq zero, d2 + movq d2, 16(z) + sbbq zero, d3 + btr $63, d3 + movq d3, 24(z) +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_ladderstep.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_ladderstep.S new file mode 100644 index 00000000000..cc05e86bdaa --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_ladderstep.S @@ -0,0 +1,743 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery ladder step on pairs of (X,Z)-projective curve25519 points +// +// extern void curve25519_ladderstep +// (uint64_t rr[16],uint64_t point[8],uint64_t pp[16],uint64_t b) +// +// If point = (X,1) and pp = (n * (X,1),[n+1] * (X,1)) then the output +// rr = (n' * (X,1),[n'+1] * (X,1)) where n' = 2 * n + b, with input +// b assumed to be 0 or 1; in this setting, each pair (X,Z) is assumed to +// be a projective y-free representation of an affine curve25519 point +// (X/Z,y), with the initial "differential" point having Z = 1 and X its +// affine x coordinate. In other words, the ladderstep operation is a +// combination of doubling, differential addition and optional swapping. +// +// Standard x86-64 ABI: RDI = rr, RSI = point, RDX = pp, RCX = b +// Microsoft x64 ABI: RCX = rr, RDX = point, R8 = pp, R9 = b +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(curve25519_ladderstep) + S2N_BN_SYM_PRIVACY_DIRECTIVE(curve25519_ladderstep) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// The single field of the input point used (z assumed 1) + +#define point_x 0(%rbp) + +// Pointer-offset pairs for pp fields +// These use the initial register %rdx as the offset. +// We then never need it again so it can be ephemeral + +#define xn 0(%rdx) +#define zn NUMSIZE(%rdx) +#define xm (2*NUMSIZE)(%rdx) +#define zm (3*NUMSIZE)(%rdx) + +// Result fields + +#define res0 0(%rbp) +#define res1 NUMSIZE(%rbp) +#define res2 (2*NUMSIZE)(%rbp) +#define res3 (3*NUMSIZE)(%rbp) + +// Pointer-offset pairs for temporaries on stack +// dmsn and dnsm need space for >= 5 digits, and we allocate 8 + +#define sm (0*NUMSIZE)(%rsp) +#define sn (1*NUMSIZE)(%rsp) +#define dm (2*NUMSIZE)(%rsp) +#define dn (3*NUMSIZE)(%rsp) +#define dmsn (4*NUMSIZE)(%rsp) +#define dnsm (6*NUMSIZE)(%rsp) +#define s (8*NUMSIZE)(%rsp) +#define d (9*NUMSIZE)(%rsp) +#define p (10*NUMSIZE)(%rsp) + +// Preserved inputs + +#define rr (12*NUMSIZE)(%rsp) +#define point (12*NUMSIZE)+8(%rsp) +#define pp (12*NUMSIZE)+16(%rsp) +#define bb (12*NUMSIZE)+24(%rsp) + +// More, but aliases to above + +#define sumx sm +#define sumz sn +#define dubx dm +#define dubz dn +#define e dubz +#define spro dnsm +#define dpro sumz + +// Total size to reserve on the stack + +#define NSPACE (13*NUMSIZE) + +// Macros wrapping up the basic field operation calls +// bignum_mul_p25519 and bignum_sqr_p25519. 
+// These two are only trivially different from pure +// function calls to those subroutines. + +#define mul_p25519(P0,P1,P2) \ + xorl %edi, %edi ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rdi, %r12 ; \ + xorl %edi, %edi ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rdi, %r13 ; \ + adcxq %rdi, %r13 ; \ + xorl %edi, %edi ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rdi, %r14 ; \ + adcxq %rdi, %r14 ; \ + xorl %edi, %edi ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rdi, %r15 ; \ + adcxq %rdi, %r15 ; \ + movl $0x26, %edx ; \ + xorl %edi, %edi ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rdi, %r12 ; \ + adcxq %rdi, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + movl $0x13, %edx ; \ + incq %r12; \ + bts $63, %r11 ; \ + mulxq %r12, %rax, %rbx ; \ + addq %rax, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rdi, %r10 ; \ + adcq %rdi, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rdx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rdi, %r9 ; \ + sbbq %rdi, %r10 ; \ + sbbq %rdi, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +#define sqr_p25519(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ebx, %ebx ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rbx, %r13 ; \ + adoxq %rbx, %r14 ; \ + adcq %rbx, %r14 ; \ + xorl %ebx, %ebx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rbx, %r15 ; \ + adoxq %rbx, %r15 ; \ + movl $0x26, %edx ; \ + xorl %ebx, %ebx ; \ + mulxq %r12, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq %r13, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; 
\ + mulxq %r14, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + adcxq %rbx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + movl $0x13, %edx ; \ + leaq 0x1(%r12), %rax ; \ + bts $0x3f, %r11 ; \ + imulq %rdx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + cmovbq %rbx, %rdx ; \ + subq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %r11 ; \ + btr $0x3f, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Multiplication just giving a 5-digit result (actually < 39 * p_25519) +// by not doing anything beyond the first stage of reduction + +#define mul_5(P0,P1,P2) \ + xorl %edi, %edi ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rdi, %r12 ; \ + xorl %edi, %edi ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rdi, %r13 ; \ + adcxq %rdi, %r13 ; \ + xorl %edi, %edi ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rdi, %r14 ; \ + adcxq %rdi, %r14 ; \ + xorl %edi, %edi ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rdi, %r15 ; \ + adcxq %rdi, %r15 ; \ + movl $0x26, %edx ; \ + xorl %edi, %edi ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rdi, %r12 ; \ + adcxq %rdi, %r12 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 ; \ + movq %r12, 0x20+P0 + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. 
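+//
+// Illustrative note (not from the upstream source): both reduction styles
+// rest on 2^256 == 38 and 2^255 == 19 (mod p_25519). In pseudocode, for a
+// 512-bit product v:
+//
+//   l0, h0 = v % 2**256, v >> 256
+//   v1 = l0 + 38 * h0                    # first-stage fold
+//   l1, h1 = v1 % 2**255, v1 >> 255
+//   r  = l1 + 19 * h1                    # sqr_4 result, < 2 * p_25519
+//
+// mul_p25519/sqr_p25519 above go further: they use the quotient estimate
+// h1 + 1, add 19 * (h1 + 1), and subtract 19 again if the carry out of
+// bit 255 shows the estimate was one too large, giving a result < p_25519.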
+ +#define sqr_4(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ebx, %ebx ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rbx, %r13 ; \ + adoxq %rbx, %r14 ; \ + adcq %rbx, %r14 ; \ + xorl %ebx, %ebx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rbx, %r15 ; \ + adoxq %rbx, %r15 ; \ + movl $0x26, %edx ; \ + xorl %ebx, %ebx ; \ + mulxq %r12, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq %r13, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq %r14, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + adcxq %rbx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Plain 4-digit add without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result + +#define add_4(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq 8+P2, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq 16+P2, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq 24+P2, %rax ; \ + movq %rax, 24+P0 + +// Add 5-digit inputs and normalize to 4 digits + +#define add5_4(P0,P1,P2) \ + movq P1, %r8 ; \ + addq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + adcq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + adcq 16+P2, %r10 ; \ + movq 24+P1, %r11 ; \ + adcq 24+P2, %r11 ; \ + movq 32+P1, %r12 ; \ + adcq 32+P2, %r12 ; \ + xorl %ebx, %ebx ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. 
It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(P0,P1,P2) \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + subq $19, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq $0, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq $0, %rax ; \ + btc $63, %rax ; \ + movq %rax, 24+P0 + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ebx, %ebx ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movl $38, %ecx ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + cmovncq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %rax ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %rax, 24+P0 + +// 5-digit subtraction with upward bias to make it positive, adding +// 1000 * (2^255 - 19) = 2^256 * 500 - 19000, then normalizing to 4 digits + +#define sub5_4(P0,P1,P2) \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %r11 ; \ + sbbq 24+P2, %r11 ; \ + movq 32+P1, %r12 ; \ + sbbq 32+P2, %r12 ; \ + xorl %ebx, %ebx ; \ + subq $19000, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %r11 ; \ + sbbq %rbx, %r12 ; \ + addq $500, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Combined z = c * x + y with reduction only < 2 * p_25519 +// It is assumed that 19 * (c * x + y) < 2^60 * 2^256 so we +// don't need a high mul in the final part. 
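+//
+// Illustrative note (not upstream text): in this ladder the constant is
+// always 121666 = 0x1db42 = (A + 2) / 4 for A = 486662, so
+// c * x + y < 2^274 and 19 * floor((c * x + y) / 2^255) fits comfortably
+// in a single word, which is why one imulq suffices in the macro below.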
+ +#define cmadd_4(P0,C1,P2,P3) \ + movq P3, %r8 ; \ + movq 8+P3, %r9 ; \ + movq 16+P3, %r10 ; \ + movq 24+P3, %r11 ; \ + xorl %edi, %edi ; \ + movq $C1, %rdx ; \ + mulxq P2, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 8+P2, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 16+P2, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 24+P2, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rdi, %rbx ; \ + adcxq %rdi, %rbx ; \ + shldq $0x1, %r11, %rbx ; \ + btr $63, %r11 ; \ + movl $0x13, %edx ; \ + imulq %rdx, %rbx ; \ + addq %rbx, %r8 ; \ + adcq %rdi, %r9 ; \ + adcq %rdi, %r10 ; \ + adcq %rdi, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Multiplex: z := if NZ then x else y + +#define mux_4(P0,P1,P2) \ + movq P1, %rax ; \ + movq P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + movq 8+P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + movq 16+P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + movq 24+P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, 24+P0 + +// Paired multiplex: (w,z) := if NZ then (y,x) else (x,y) + +#define muxpair_4(P0,P1,P2,P3) \ + movq P2, %rax ; \ + movq P3, %rcx ; \ + movq %rax, %rdx ; \ + cmovnzq %rcx, %rax ; \ + cmovnzq %rdx, %rcx ; \ + movq %rax, P0 ; \ + movq %rcx, P1 ; \ + movq 8+P2, %rax ; \ + movq 8+P3, %rcx ; \ + movq %rax, %rdx ; \ + cmovnzq %rcx, %rax ; \ + cmovnzq %rdx, %rcx ; \ + movq %rax, 8+P0 ; \ + movq %rcx, 8+P1 ; \ + movq 16+P2, %rax ; \ + movq 16+P3, %rcx ; \ + movq %rax, %rdx ; \ + cmovnzq %rcx, %rax ; \ + cmovnzq %rdx, %rcx ; \ + movq %rax, 16+P0 ; \ + movq %rcx, 16+P1 ; \ + movq 24+P2, %rax ; \ + movq 24+P3, %rcx ; \ + movq %rax, %rdx ; \ + cmovnzq %rcx, %rax ; \ + cmovnzq %rdx, %rcx ; \ + movq %rax, 24+P0 ; \ + movq %rcx, 24+P1 + +S2N_BN_SYMBOL(curve25519_ladderstep): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Save registers, make room for temps, preserve input arguments. + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdi, rr + movq %rsi, point + movq %rcx, bb + +// sm = xm + zm; sn = xn + zn; dm = xm - zm; dn = xn - zn +// The adds don't need any normalization as they're fed to muls +// Just make sure the subs fit in 4 digits. Keep pp in %rdx +// here, after which we can forget about it. 
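+//
+// Illustrative summary of the formulas computed below (not upstream text):
+// with s = (xt + zt)^2, d = (xt - zt)^2 and p = s - d = 4 * xt * zt, the
+// doubling output is (s * d, p * (d + 121666 * p)); with
+// dmsn = (xm - zm) * (xn + zn) and dnsm = (xm + zm) * (xn - zn), the
+// differential addition output is ((dmsn + dnsm)^2, x * (dmsn - dnsm)^2),
+// where x is the affine x-coordinate of the base point.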
+ + sub_4(dm,xm,zm) + add_4(sn,xn,zn) + sub_4(dn,xn,zn) + add_4(sm,xm,zm) + +// ADDING: dmsn = dm * sn; dnsm = sm * dn +// DOUBLING: mux d = xt - zt and s = xt + zt for appropriate choice of (xt,zt) + + mul_5(dmsn,dm,sn) + + movq bb, %rax + testq %rax, %rax + mux_4(d,dm,dn) + mux_4(s,sm,sn) + + mul_5(dnsm,sm,dn) + +// DOUBLING: d = (xt - zt)^2 normalized only to 4 digits + + sqr_4(d,d) + +// ADDING: dpro = (dmsn - dnsm)^2, spro = (dmsn + dnsm)^2 +// DOUBLING: s = (xt + zt)^2, normalized only to 4 digits + + sub5_4(dpro,dmsn,dnsm) + sqr_4(s,s) + add5_4(spro,dmsn,dnsm) + sqr_4(dpro,dpro) + +// DOUBLING: p = 4 * xt * zt = s - d + + sub_twice4(p,s,d) + +// ADDING: sumx = (dmsn + dnsm)^2 + + sqr_p25519(sumx,spro) + +// DOUBLING: e = 121666 * p + d + + cmadd_4(e,0x1db42,p,d) + +// DOUBLING: dubx = (xt + zt)^2 * (xt - zt)^2 = s * d + + mul_p25519(dubx,s,d) + +// ADDING: sumz = x * (dmsn - dnsm)^2 + + movq point, %rbp + mul_p25519(sumz,dpro,point_x) + +// DOUBLING: dubz = (4 * xt * zt) * ((xt - zt)^2 + 121666 * (4 * xt * zt)) +// = p * (d + 121666 * p) + + mul_p25519(dubz,p,e) + +// Multiplex the outputs + + movq bb, %rax + movq rr, %rbp + testq %rax, %rax + muxpair_4(res0,res2,dubx,sumx) + muxpair_4(res1,res3,dubz,sumz) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_ladderstep_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_ladderstep_alt.S new file mode 100644 index 00000000000..5ca8e9997d4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_ladderstep_alt.S @@ -0,0 +1,909 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery ladder step on pairs of (X,Z)-projective curve25519 points +// +// extern void curve25519_ladderstep_alt +// (uint64_t rr[16],uint64_t point[8],uint64_t pp[16],uint64_t b) +// +// If point = (X,1) and pp = (n * (X,1),[n+1] * (X,1)) then the output +// rr = (n' * (X,1),[n'+1] * (X,1)) where n' = 2 * n + b, with input +// b assumed to be 0 or 1; in this setting, each pair (X,Z) is assumed to +// be a projective y-free representation of an affine curve25519 point +// (X/Z,y), with the initial "differential" point having Z = 1 and X its +// affine x coordinate. In other words, the ladderstep operation is a +// combination of doubling, differential addition and optional swapping. +// +// Standard x86-64 ABI: RDI = rr, RSI = point, RDX = pp, RCX = b +// Microsoft x64 ABI: RCX = rr, RDX = point, R8 = pp, R9 = b +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(curve25519_ladderstep_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(curve25519_ladderstep_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// The single field of the input point used (z assumed 1) + +#define point_x 0(%rbp) + +// Pointer-offset pairs for pp fields +// These use the initial register %rdx as the offset. 
+// We then never need it again so it can be ephemeral + +#define xn 0(%rdx) +#define zn NUMSIZE(%rdx) +#define xm (2*NUMSIZE)(%rdx) +#define zm (3*NUMSIZE)(%rdx) + +// Result fields + +#define res0 0(%rbp) +#define res1 NUMSIZE(%rbp) +#define res2 (2*NUMSIZE)(%rbp) +#define res3 (3*NUMSIZE)(%rbp) + +// Pointer-offset pairs for temporaries on stack +// dmsn and dnsm need space for >= 5 digits, and we allocate 8 + +#define sm (0*NUMSIZE)(%rsp) +#define sn (1*NUMSIZE)(%rsp) +#define dm (2*NUMSIZE)(%rsp) +#define dn (3*NUMSIZE)(%rsp) +#define dmsn (4*NUMSIZE)(%rsp) +#define dnsm (6*NUMSIZE)(%rsp) +#define s (8*NUMSIZE)(%rsp) +#define d (9*NUMSIZE)(%rsp) +#define p (10*NUMSIZE)(%rsp) + +// Preserved inputs + +#define rr (12*NUMSIZE)(%rsp) +#define point (12*NUMSIZE)+8(%rsp) +#define pp (12*NUMSIZE)+16(%rsp) +#define bb (12*NUMSIZE)+24(%rsp) + +// More, but aliases to above + +#define sumx sm +#define sumz sn +#define dubx dm +#define dubz dn +#define e dubz +#define spro dnsm +#define dpro sumz + +// Total size to reserve on the stack + +#define NSPACE (13*NUMSIZE) + +// Macros wrapping up the basic field operation calls +// bignum_mul_p25519_alt and bignum_sqr_p25519_alt. +// These two are only trivially different from pure +// function calls to those subroutines. + +#define mul_p25519(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %esi ; \ + movq %r12, %rax ; \ + mulq %rsi; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + 
movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movl $0x13, %esi ; \ + bts $63, %r11 ; \ + imulq %rsi, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rsi, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +#define sqr_p25519(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r14 ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r15 ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %esi ; \ + movq %r12, %rax ; \ + mulq %rsi; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movl $0x13, %esi ; \ + bts $63, %r11 ; \ + imulq %rsi, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rsi, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Multiplication just giving a 5-digit result (actually < 39 * p_25519) +// by not doing anything beyond the first stage of reduction + +#define mul_5(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, 
%rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %esi ; \ + movq %r12, %rax ; \ + mulq %rsi; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 ; \ + movq %r12, 0x20+P0 + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. 
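+//
+// Note (not from the upstream source): this and the other macros in this
+// _alt file use the legacy mulq instruction with plain add/adc carry
+// chains instead of mulx/adcx/adox, presumably for CPUs without BMI2/ADX;
+// the reduction logic is the same as in curve25519_ladderstep.S above.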
+ +#define sqr_4(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r14 ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r15 ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %esi ; \ + movq %r12, %rax ; \ + mulq %rsi; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Plain 4-digit add without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result + +#define add_4(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq 8+P2, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq 16+P2, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq 24+P2, %rax ; \ + movq %rax, 24+P0 + +// Add 5-digit inputs and normalize to 4 digits + +#define add5_4(P0,P1,P2) \ + movq P1, %r8 ; \ + addq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + adcq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + adcq 16+P2, %r10 ; \ + movq 24+P1, %r11 ; \ + adcq 24+P2, %r11 ; \ + movq 32+P1, %r12 ; \ + adcq 32+P2, %r12 ; \ + xorl %ebx, %ebx ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. 
It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(P0,P1,P2) \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + subq $19, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq $0, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq $0, %rax ; \ + btc $63, %rax ; \ + movq %rax, 24+P0 + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ebx, %ebx ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movl $38, %ecx ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + cmovncq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %rax ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %rax, 24+P0 + +// 5-digit subtraction with upward bias to make it positive, adding +// 1000 * (2^255 - 19) = 2^256 * 500 - 19000, then normalizing to 4 digits + +#define sub5_4(P0,P1,P2) \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %r11 ; \ + sbbq 24+P2, %r11 ; \ + movq 32+P1, %r12 ; \ + sbbq 32+P2, %r12 ; \ + xorl %ebx, %ebx ; \ + subq $19000, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %r11 ; \ + sbbq %rbx, %r12 ; \ + addq $500, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Combined z = c * x + y with reduction only < 2 * p_25519 +// It is assumed that 19 * (c * x + y) < 2^60 * 2^256 so we +// don't need a high mul in the final part. 
+ +#define cmadd_4(P0,C1,P2,P3) \ + movq $C1, %rsi ; \ + movq P2, %rax ; \ + mulq %rsi; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + movq 0x8+P2, %rax ; \ + xorq %r10, %r10 ; \ + mulq %rsi; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x10+P2, %rax ; \ + mulq %rsi; \ + addq %rax, %r10 ; \ + adcq $0x0, %rdx ; \ + movq 0x18+P2, %rax ; \ + movq %rdx, %r11 ; \ + mulq %rsi; \ + xorl %esi, %esi ; \ + addq %rax, %r11 ; \ + adcq %rsi, %rdx ; \ + addq P3, %r8 ; \ + adcq 0x8+P3, %r9 ; \ + adcq 0x10+P3, %r10 ; \ + adcq 0x18+P3, %r11 ; \ + adcq %rsi, %rdx ; \ + shldq $0x1, %r11, %rdx ; \ + btr $63, %r11 ; \ + movl $0x13, %ebx ; \ + imulq %rbx, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rsi, %r9 ; \ + adcq %rsi, %r10 ; \ + adcq %rsi, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Multiplex: z := if NZ then x else y + +#define mux_4(P0,P1,P2) \ + movq P1, %rax ; \ + movq P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + movq 8+P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + movq 16+P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + movq 24+P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, 24+P0 + +// Paired multiplex: (w,z) := if NZ then (y,x) else (x,y) + +#define muxpair_4(P0,P1,P2,P3) \ + movq P2, %rax ; \ + movq P3, %rcx ; \ + movq %rax, %rdx ; \ + cmovnzq %rcx, %rax ; \ + cmovnzq %rdx, %rcx ; \ + movq %rax, P0 ; \ + movq %rcx, P1 ; \ + movq 8+P2, %rax ; \ + movq 8+P3, %rcx ; \ + movq %rax, %rdx ; \ + cmovnzq %rcx, %rax ; \ + cmovnzq %rdx, %rcx ; \ + movq %rax, 8+P0 ; \ + movq %rcx, 8+P1 ; \ + movq 16+P2, %rax ; \ + movq 16+P3, %rcx ; \ + movq %rax, %rdx ; \ + cmovnzq %rcx, %rax ; \ + cmovnzq %rdx, %rcx ; \ + movq %rax, 16+P0 ; \ + movq %rcx, 16+P1 ; \ + movq 24+P2, %rax ; \ + movq 24+P3, %rcx ; \ + movq %rax, %rdx ; \ + cmovnzq %rcx, %rax ; \ + cmovnzq %rdx, %rcx ; \ + movq %rax, 24+P0 ; \ + movq %rcx, 24+P1 + +S2N_BN_SYMBOL(curve25519_ladderstep_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Save registers, make room for temps, preserve input arguments. + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdi, rr + movq %rsi, point + movq %rcx, bb + +// sm = xm + zm; sn = xn + zn; dm = xm - zm; dn = xn - zn +// The adds don't need any normalization as they're fed to muls +// Just make sure the subs fit in 4 digits. Keep pp in %rdx +// here, after which we can forget about it. 
+ + sub_4(dm,xm,zm) + add_4(sn,xn,zn) + sub_4(dn,xn,zn) + add_4(sm,xm,zm) + +// ADDING: dmsn = dm * sn; dnsm = sm * dn +// DOUBLING: mux d = xt - zt and s = xt + zt for appropriate choice of (xt,zt) + + mul_5(dmsn,dm,sn) + + movq bb, %rax + testq %rax, %rax + mux_4(d,dm,dn) + mux_4(s,sm,sn) + + mul_5(dnsm,sm,dn) + +// DOUBLING: d = (xt - zt)^2 normalized only to 4 digits + + sqr_4(d,d) + +// ADDING: dpro = (dmsn - dnsm)^2, spro = (dmsn + dnsm)^2 +// DOUBLING: s = (xt + zt)^2, normalized only to 4 digits + + sub5_4(dpro,dmsn,dnsm) + sqr_4(s,s) + add5_4(spro,dmsn,dnsm) + sqr_4(dpro,dpro) + +// DOUBLING: p = 4 * xt * zt = s - d + + sub_twice4(p,s,d) + +// ADDING: sumx = (dmsn + dnsm)^2 + + sqr_p25519(sumx,spro) + +// DOUBLING: e = 121666 * p + d + + cmadd_4(e,0x1db42,p,d) + +// DOUBLING: dubx = (xt + zt)^2 * (xt - zt)^2 = s * d + + mul_p25519(dubx,s,d) + +// ADDING: sumz = x * (dmsn - dnsm)^2 + + movq point, %rbp + mul_p25519(sumz,dpro,point_x) + +// DOUBLING: dubz = (4 * xt * zt) * ((xt - zt)^2 + 121666 * (4 * xt * zt)) +// = p * (d + 121666 * p) + + mul_p25519(dubz,p,e) + +// Multiplex the outputs + + movq bb, %rax + movq rr, %rbp + testq %rax, %rax + muxpair_4(res0,res2,dubx,sumx) + muxpair_4(res1,res3,dubz,sumz) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_pxscalarmul.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_pxscalarmul.S new file mode 100644 index 00000000000..33b41f8ed68 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_pxscalarmul.S @@ -0,0 +1,771 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Projective scalar multiplication, x coordinate only, for curve25519 +// Inputs scalar[4], point[4]; output res[8] +// +// extern void curve25519_pxscalarmul +// (uint64_t res[static 8],uint64_t scalar[static 4],uint64_t point[static 4]) +// +// Given the X coordinate of an input point = (X,Y) on curve25519, which +// could also be part of a projective representation (X,Y,1) of the same +// point, returns a projective representation (X,Z) = scalar * point, where +// scalar is a 256-bit number. The corresponding affine form is (X/Z,Y'), +// X/Z meaning division modulo 2^255-19, and Y' not being computed by +// this function (nor is any Y coordinate of the input point used). 
+// +// Standard x86-64 ABI: RDI = res, RSI = scalar, RDX = point +// Microsoft x64 ABI: RCX = res, RDX = scalar, R8 = point +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(curve25519_pxscalarmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(curve25519_pxscalarmul) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence +// and additional registers for loop counter and swap flag + +#define res 10*NUMSIZE(%rsp) +#define point 10*NUMSIZE+8(%rsp) +#define scalar 10*NUMSIZE+16(%rsp) +#define i 10*NUMSIZE+24(%rsp) +#define swap 10*NUMSIZE+32(%rsp) + +// Pointers to input x coord (we don't use y or z) and output coords. +// These all assume the base address (point and res respectively) is +// currently in the %rbp register. + +#define x 0(%rbp) +#define resx 0(%rbp) +#define resz NUMSIZE(%rbp) + +// Pointer-offset pairs for temporaries on stack with some aliasing. +// Both dmsn and dnsm need space for >= 5 digits, and we allocate 8 + +#define zm (0*NUMSIZE)(%rsp) +#define sm (0*NUMSIZE)(%rsp) +#define dpro (0*NUMSIZE)(%rsp) + +#define sn (1*NUMSIZE)(%rsp) + +#define dm (2*NUMSIZE)(%rsp) + +#define zn (3*NUMSIZE)(%rsp) +#define dn (3*NUMSIZE)(%rsp) +#define e (3*NUMSIZE)(%rsp) + +#define dmsn (4*NUMSIZE)(%rsp) +#define p (4*NUMSIZE)(%rsp) + +#define xm (6*NUMSIZE)(%rsp) +#define dnsm (6*NUMSIZE)(%rsp) +#define spro (6*NUMSIZE)(%rsp) + +#define xn (8*NUMSIZE)(%rsp) +#define s (8*NUMSIZE)(%rsp) + +#define d (9*NUMSIZE)(%rsp) + +// Total size to reserve on the stack +// This includes space for the 5 other variables above + +#define NSPACE (10*NUMSIZE+40) + +// Macros wrapping up the basic field operation calls +// bignum_mul_p25519 and bignum_sqr_p25519. +// These two are only trivially different from pure +// function calls to those subroutines. 
+ +#define mul_p25519(P0,P1,P2) \ + xorl %edi, %edi ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rdi, %r12 ; \ + xorl %edi, %edi ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rdi, %r13 ; \ + adcxq %rdi, %r13 ; \ + xorl %edi, %edi ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rdi, %r14 ; \ + adcxq %rdi, %r14 ; \ + xorl %edi, %edi ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rdi, %r15 ; \ + adcxq %rdi, %r15 ; \ + movl $0x26, %edx ; \ + xorl %edi, %edi ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rdi, %r12 ; \ + adcxq %rdi, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + movl $0x13, %edx ; \ + incq %r12; \ + bts $63, %r11 ; \ + mulxq %r12, %rax, %rbx ; \ + addq %rax, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rdi, %r10 ; \ + adcq %rdi, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rdx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rdi, %r9 ; \ + sbbq %rdi, %r10 ; \ + sbbq %rdi, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +#define sqr_p25519(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ebx, %ebx ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rbx, %r13 ; \ + adoxq %rbx, %r14 ; \ + adcq %rbx, %r14 ; \ + xorl %ebx, %ebx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rbx, %r15 ; \ + adoxq %rbx, %r15 ; \ + movl $0x26, %edx ; \ + xorl %ebx, %ebx ; \ + mulxq %r12, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq %r13, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq %r14, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq %r15, 
%rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + adcxq %rbx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + movl $0x13, %edx ; \ + leaq 0x1(%r12), %rax ; \ + bts $0x3f, %r11 ; \ + imulq %rdx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + cmovbq %rbx, %rdx ; \ + subq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %r11 ; \ + btr $0x3f, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Multiplication just giving a 5-digit result (actually < 39 * p_25519) +// by not doing anything beyond the first stage of reduction + +#define mul_5(P0,P1,P2) \ + xorl %edi, %edi ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rdi, %r12 ; \ + xorl %edi, %edi ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rdi, %r13 ; \ + adcxq %rdi, %r13 ; \ + xorl %edi, %edi ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rdi, %r14 ; \ + adcxq %rdi, %r14 ; \ + xorl %edi, %edi ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rdi, %r15 ; \ + adcxq %rdi, %r15 ; \ + movl $0x26, %edx ; \ + xorl %edi, %edi ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rdi, %r12 ; \ + adcxq %rdi, %r12 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 ; \ + movq %r12, 0x20+P0 + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. 
+ +#define sqr_4(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ebx, %ebx ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rbx, %r13 ; \ + adoxq %rbx, %r14 ; \ + adcq %rbx, %r14 ; \ + xorl %ebx, %ebx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rbx, %r15 ; \ + adoxq %rbx, %r15 ; \ + movl $0x26, %edx ; \ + xorl %ebx, %ebx ; \ + mulxq %r12, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq %r13, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq %r14, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + adcxq %rbx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Plain 4-digit add without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result + +#define add_4(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq 8+P2, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq 16+P2, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq 24+P2, %rax ; \ + movq %rax, 24+P0 + +// Add 5-digit inputs and normalize to 4 digits + +#define add5_4(P0,P1,P2) \ + movq P1, %r8 ; \ + addq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + adcq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + adcq 16+P2, %r10 ; \ + movq 24+P1, %r11 ; \ + adcq 24+P2, %r11 ; \ + movq 32+P1, %r12 ; \ + adcq 32+P2, %r12 ; \ + xorl %ebx, %ebx ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. 
It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(P0,P1,P2) \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + subq $19, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq $0, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq $0, %rax ; \ + btc $63, %rax ; \ + movq %rax, 24+P0 + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ebx, %ebx ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movl $38, %ecx ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + cmovncq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %rax ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %rax, 24+P0 + +// 5-digit subtraction with upward bias to make it positive, adding +// 1000 * (2^255 - 19) = 2^256 * 500 - 19000, then normalizing to 4 digits + +#define sub5_4(P0,P1,P2) \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %r11 ; \ + sbbq 24+P2, %r11 ; \ + movq 32+P1, %r12 ; \ + sbbq 32+P2, %r12 ; \ + xorl %ebx, %ebx ; \ + subq $19000, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %r11 ; \ + sbbq %rbx, %r12 ; \ + addq $500, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Combined z = c * x + y with reduction only < 2 * p_25519 +// It is assumed that 19 * (c * x + y) < 2^60 * 2^256 so we +// don't need a high mul in the final part. + +#define cmadd_4(P0,C1,P2,P3) \ + movq P3, %r8 ; \ + movq 8+P3, %r9 ; \ + movq 16+P3, %r10 ; \ + movq 24+P3, %r11 ; \ + xorl %edi, %edi ; \ + movq $C1, %rdx ; \ + mulxq P2, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 8+P2, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 16+P2, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 24+P2, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rdi, %rbx ; \ + adcxq %rdi, %rbx ; \ + shldq $0x1, %r11, %rbx ; \ + btr $63, %r11 ; \ + movl $0x13, %edx ; \ + imulq %rdx, %rbx ; \ + addq %rbx, %r8 ; \ + adcq %rdi, %r9 ; \ + adcq %rdi, %r10 ; \ + adcq %rdi, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Multiplex: z := if NZ then x else y + +#define mux_4(P0,P1,P2) \ + movq P1, %rax ; \ + movq P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + movq 8+P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + movq 16+P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + movq 24+P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, 24+P0 + +S2N_BN_SYMBOL(curve25519_pxscalarmul): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers, make room for temps, preserve input arguments. 
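+// For reference: up to the bit-driven multiplexing, each iteration of the
+// main loop below computes one combined differential addition and doubling
+// of the Montgomery ladder, roughly in the notation of RFC 7748, with
+// (x2,z2) the point selected for doubling and (x3,z3) the other one:
+//
+//     A  = x2 + z2            B  = x2 - z2
+//     AA = A^2                BB = B^2
+//     E  = AA - BB
+//     C  = x3 + z3            D  = x3 - z3
+//     DA = D * A              CB = C * B
+//     x3' = (DA + CB)^2       z3' = x1 * (DA - CB)^2
+//     x2' = AA * BB           z2' = E * (BB + 121666 * E)
+//
+// where z2' equals the RFC 7748 form E * (AA + 121665 * E), since AA = BB + E.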
+ + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $NSPACE, %rsp + +// Move the input arguments to stable places + + movq %rdi, res + movq %rsi, scalar + movq %rdx, point + +// Initialize (xn,zn) = (1,0) and (xm,zm) = (x,1) with swap = 0 + + movq $1, %rax + movq %rax, 256(%rsp) + movq %rax, (%rsp) + xorl %eax, %eax + movq %rax, swap + movq %rax, 96(%rsp) + movq %rax, 264(%rsp) + movq %rax, 8(%rsp) + movq %rax, 104(%rsp) + movq %rax, 272(%rsp) + movq %rax, 16(%rsp) + movq %rax, 112(%rsp) + movq %rax, 280(%rsp) + movq %rax, 24(%rsp) + movq %rax, 120(%rsp) + movq (%rdx), %rax + movq %rax, 192(%rsp) + movq 8(%rdx), %rax + movq %rax, 200(%rsp) + movq 16(%rdx), %rax + movq %rax, 208(%rsp) + movq 24(%rdx), %rax + movq %rax, 216(%rsp) + +// The outer loop from i = 255, ..., i = 0 (inclusive) + + movl $255, %eax + movq %rax, i + +curve25519_pxscalarmul_loop: + +// sm = xm + zm; sn = xn + zn; dm = xm - zm; dn = xn - zn +// The adds don't need any normalization as they're fed to muls +// Just make sure the subs fit in 4 digits. + + sub_4(dm,xm,zm) + add_4(sn,xn,zn) + sub_4(dn,xn,zn) + add_4(sm,xm,zm) + +// ADDING: dmsn = dm * sn; dnsm = sm * dn +// DOUBLING: mux d = xt - zt and s = xt + zt for appropriate choice of (xt,zt) + + mul_5(dmsn,sn,dm) + + movq scalar, %rax + movq i, %rdx + movq %rdx, %rcx + shrq $6, %rdx + movq (%rax,%rdx,8), %rdx + shrq %cl, %rdx + andq $1, %rdx + cmpq swap, %rdx + movq %rdx, swap + + mux_4(d,dm,dn) + mux_4(s,sm,sn) + + mul_5(dnsm,sm,dn) + +// DOUBLING: d = (xt - zt)^2 normalized only to 4 digits + + sqr_4(d,d) + +// ADDING: dpro = (dmsn - dnsm)^2, spro = (dmsn + dnsm)^2 +// DOUBLING: s = (xt + zt)^2, normalized only to 4 digits + + sub5_4(dpro,dmsn,dnsm) + sqr_4(s,s) + add5_4(spro,dmsn,dnsm) + sqr_4(dpro,dpro) + +// DOUBLING: p = 4 * xt * zt = s - d + + sub_twice4(p,s,d) + +// ADDING: xm' = (dmsn + dnsm)^2 + + sqr_p25519(xm,spro) + +// DOUBLING: e = 121666 * p + d + + cmadd_4(e,0x1db42,p,d) + +// DOUBLING: xn' = (xt + zt)^2 * (xt - zt)^2 = s * d + + mul_p25519(xn,s,d) + +// ADDING: zm' = x * (dmsn - dnsm)^2 + + movq point, %rbp + mul_p25519(zm,dpro,x) + +// DOUBLING: zn' = (4 * xt * zt) * ((xt - zt)^2 + 121666 * (4 * xt * zt)) +// = p * (d + 121666 * p) + + mul_p25519(zn,p,e) + +// Loop down as far as 0 (inclusive) + + movq i, %rax + subq $1, %rax + movq %rax, i + jnc curve25519_pxscalarmul_loop + +// The main loop does not handle the special input of the 2-torsion +// point = (0,0). In that case we may get a spurious (0,0) as output +// when we want (0,1) [for odd scalar] or (1,0) [for even scalar]. +// Test if x = 0 (this is equivalent for curve25519 to y = 0) and if +// so, patch zm = 1 [for odd multiple], xn = 1 [for even multiple]. 
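+// In C terms the patch below is roughly (x being the input x-coordinate,
+// zm and xn the stack temporaries holding those field elements):
+//
+//     uint64_t x_is_zero = ((x[0] | x[1] | x[2] | x[3]) == 0);
+//     zm[0] |= x_is_zero;   // (xm,zm) becomes (0,1) for an odd multiple
+//     xn[0] |= x_is_zero;   // (xn,zn) becomes (1,0) for an even multiple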
+ + movl $1, %ecx + movq point, %rbp + movq (%rbp), %rax + orq 8(%rbp), %rax + orq 16(%rbp), %rax + orq 24(%rbp), %rax + cmovnzq %rcx, %rax + xorq $1, %rax + orq %rax, (%rsp) + orq %rax, 256(%rsp) + +// Multiplex into the final outputs + + movq res, %rbp + movq swap, %rax + testq %rax, %rax + + mux_4(resx,xm,xn) + mux_4(resz,zm,zn) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_pxscalarmul_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_pxscalarmul_alt.S new file mode 100644 index 00000000000..65f896ddd81 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_pxscalarmul_alt.S @@ -0,0 +1,937 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Projective scalar multiplication, x coordinate only, for curve25519 +// Inputs scalar[4], point[4]; output res[8] +// +// extern void curve25519_pxscalarmul_alt +// (uint64_t res[static 8],uint64_t scalar[static 4],uint64_t point[static 4]) +// +// Given the X coordinate of an input point = (X,Y) on curve25519, which +// could also be part of a projective representation (X,Y,1) of the same +// point, returns a projective representation (X,Z) = scalar * point, where +// scalar is a 256-bit number. The corresponding affine form is (X/Z,Y'), +// X/Z meaning division modulo 2^255-19, and Y' not being computed by +// this function (nor is any Y coordinate of the input point used). +// +// Standard x86-64 ABI: RDI = res, RSI = scalar, RDX = point +// Microsoft x64 ABI: RCX = res, RDX = scalar, R8 = point +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(curve25519_pxscalarmul_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(curve25519_pxscalarmul_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence +// and additional registers for loop counter and swap flag + +#define res 10*NUMSIZE(%rsp) +#define point 10*NUMSIZE+8(%rsp) +#define scalar 10*NUMSIZE+16(%rsp) +#define i 10*NUMSIZE+24(%rsp) +#define swap 10*NUMSIZE+32(%rsp) + +// Pointers to input x coord (we don't use y or z) and output coords. +// These all assume the base address (point and res respectively) is +// currently in the %rbp register. + +#define x 0(%rbp) +#define resx 0(%rbp) +#define resz NUMSIZE(%rbp) + +// Pointer-offset pairs for temporaries on stack with some aliasing. 
+// Both dmsn and dnsm need space for >= 5 digits, and we allocate 8 + +#define zm (0*NUMSIZE)(%rsp) +#define sm (0*NUMSIZE)(%rsp) +#define dpro (0*NUMSIZE)(%rsp) + +#define sn (1*NUMSIZE)(%rsp) + +#define dm (2*NUMSIZE)(%rsp) + +#define zn (3*NUMSIZE)(%rsp) +#define dn (3*NUMSIZE)(%rsp) +#define e (3*NUMSIZE)(%rsp) + +#define dmsn (4*NUMSIZE)(%rsp) +#define p (4*NUMSIZE)(%rsp) + +#define xm (6*NUMSIZE)(%rsp) +#define dnsm (6*NUMSIZE)(%rsp) +#define spro (6*NUMSIZE)(%rsp) + +#define xn (8*NUMSIZE)(%rsp) +#define s (8*NUMSIZE)(%rsp) + +#define d (9*NUMSIZE)(%rsp) + +// Total size to reserve on the stack +// This includes space for the 5 other variables above + +#define NSPACE (10*NUMSIZE+40) + +// Macros wrapping up the basic field operation calls +// bignum_mul_p25519_alt and bignum_sqr_p25519_alt. +// These two are only trivially different from pure +// function calls to those subroutines. + +#define mul_p25519(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %esi ; \ + movq %r12, %rax ; \ + mulq %rsi; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movl $0x13, %esi ; \ + bts $63, %r11 ; \ + imulq %rsi, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rsi, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq 
%rcx, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +#define sqr_p25519(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r14 ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r15 ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %esi ; \ + movq %r12, %rax ; \ + mulq %rsi; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movl $0x13, %esi ; \ + bts $63, %r11 ; \ + imulq %rsi, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rsi, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Multiplication just giving a 5-digit result (actually < 39 * p_25519) +// by not doing anything beyond the first stage of reduction + +#define mul_5(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; 
\ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %esi ; \ + movq %r12, %rax ; \ + mulq %rsi; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 ; \ + movq %r12, 0x20+P0 + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. 
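+// For reference, the quotient estimate that sqr_4 below skips (and that
+// mul_p25519/sqr_p25519 above use) is q = (h >> 255) + 1 applied to the
+// folded 5-limb value h, followed by one flag-driven correction.  An
+// unoptimized C reference for the same fully reduced result, using a
+// hypothetical helper name and assuming limbs t[0..4] with t[4] small, is:
+//
+//     // hypothetical reference, not the routine used by the macros above
+//     static void ref_mod_p25519(uint64_t r[4], const uint64_t t[5]) {
+//       // fold bits >= 2^255 back in, using 2^255 == 19 (mod p_25519)
+//       unsigned __int128 c = (unsigned __int128)19 * ((t[4] << 1) | (t[3] >> 63));
+//       uint64_t s[4] = { t[0], t[1], t[2], t[3] & ~(1ULL << 63) };
+//       for (int i = 0; i < 4; i++) { c += s[i]; s[i] = (uint64_t)c; c >>= 64; }
+//       // at most one conditional subtraction of p is now needed
+//       const uint64_t p[4] = { 0xffffffffffffffedULL, ~0ULL, ~0ULL, 0x7fffffffffffffffULL };
+//       uint64_t d[4], borrow = 0;
+//       for (int i = 0; i < 4; i++) {
+//         unsigned __int128 diff = (unsigned __int128)s[i] - p[i] - borrow;
+//         d[i] = (uint64_t)diff; borrow = (uint64_t)(diff >> 64) & 1;
+//       }
+//       for (int i = 0; i < 4; i++) r[i] = borrow ? s[i] : d[i];  // keep s if s < p
+//     }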
+ +#define sqr_4(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r14 ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r15 ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %esi ; \ + movq %r12, %rax ; \ + mulq %rsi; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Plain 4-digit add without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result + +#define add_4(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq 8+P2, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq 16+P2, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq 24+P2, %rax ; \ + movq %rax, 24+P0 + +// Add 5-digit inputs and normalize to 4 digits + +#define add5_4(P0,P1,P2) \ + movq P1, %r8 ; \ + addq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + adcq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + adcq 16+P2, %r10 ; \ + movq 24+P1, %r11 ; \ + adcq 24+P2, %r11 ; \ + movq 32+P1, %r12 ; \ + adcq 32+P2, %r12 ; \ + xorl %ebx, %ebx ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. 
It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(P0,P1,P2) \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + subq $19, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq $0, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq $0, %rax ; \ + btc $63, %rax ; \ + movq %rax, 24+P0 + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ebx, %ebx ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movl $38, %ecx ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + cmovncq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %rax ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %rax, 24+P0 + +// 5-digit subtraction with upward bias to make it positive, adding +// 1000 * (2^255 - 19) = 2^256 * 500 - 19000, then normalizing to 4 digits + +#define sub5_4(P0,P1,P2) \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %r11 ; \ + sbbq 24+P2, %r11 ; \ + movq 32+P1, %r12 ; \ + sbbq 32+P2, %r12 ; \ + xorl %ebx, %ebx ; \ + subq $19000, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %r11 ; \ + sbbq %rbx, %r12 ; \ + addq $500, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Combined z = c * x + y with reduction only < 2 * p_25519 +// It is assumed that 19 * (c * x + y) < 2^60 * 2^256 so we +// don't need a high mul in the final part. + +#define cmadd_4(P0,C1,P2,P3) \ + movq $C1, %rsi ; \ + movq P2, %rax ; \ + mulq %rsi; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + movq 0x8+P2, %rax ; \ + xorq %r10, %r10 ; \ + mulq %rsi; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x10+P2, %rax ; \ + mulq %rsi; \ + addq %rax, %r10 ; \ + adcq $0x0, %rdx ; \ + movq 0x18+P2, %rax ; \ + movq %rdx, %r11 ; \ + mulq %rsi; \ + xorl %esi, %esi ; \ + addq %rax, %r11 ; \ + adcq %rsi, %rdx ; \ + addq P3, %r8 ; \ + adcq 0x8+P3, %r9 ; \ + adcq 0x10+P3, %r10 ; \ + adcq 0x18+P3, %r11 ; \ + adcq %rsi, %rdx ; \ + shldq $0x1, %r11, %rdx ; \ + btr $63, %r11 ; \ + movl $0x13, %ebx ; \ + imulq %rbx, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rsi, %r9 ; \ + adcq %rsi, %r10 ; \ + adcq %rsi, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Multiplex: z := if NZ then x else y + +#define mux_4(P0,P1,P2) \ + movq P1, %rax ; \ + movq P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + movq 8+P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + movq 16+P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + movq 24+P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, 24+P0 + +S2N_BN_SYMBOL(curve25519_pxscalarmul_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers, make room for temps, preserve input arguments. 
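+// Two notes on constants used in the macros above:
+//
+//   * sub5_4 adds the bias 1000 * p_25519 = 1000 * (2^255 - 19)
+//                                         = 500 * 2^256 - 19000,
+//     hence the "subq $19000" on the bottom limb and "addq $500" on the
+//     fifth limb; the 5-digit quantities being subtracted are far below
+//     1000 * p_25519, so the biased difference is always nonnegative.
+//
+//   * cmadd_4 is only used as e = 121666 * p + d with 4-digit p and d, so
+//     19 * (121666 * p + d) is far below the 2^60 * 2^256 bound assumed in
+//     its comment, and the single imulq by 19 cannot overflow one limb.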
+ + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $NSPACE, %rsp + +// Move the input arguments to stable places + + movq %rdi, res + movq %rsi, scalar + movq %rdx, point + +// Initialize (xn,zn) = (1,0) and (xm,zm) = (x,1) with swap = 0 + + movq $1, %rax + movq %rax, 256(%rsp) + movq %rax, (%rsp) + xorl %eax, %eax + movq %rax, swap + movq %rax, 96(%rsp) + movq %rax, 264(%rsp) + movq %rax, 8(%rsp) + movq %rax, 104(%rsp) + movq %rax, 272(%rsp) + movq %rax, 16(%rsp) + movq %rax, 112(%rsp) + movq %rax, 280(%rsp) + movq %rax, 24(%rsp) + movq %rax, 120(%rsp) + movq (%rdx), %rax + movq %rax, 192(%rsp) + movq 8(%rdx), %rax + movq %rax, 200(%rsp) + movq 16(%rdx), %rax + movq %rax, 208(%rsp) + movq 24(%rdx), %rax + movq %rax, 216(%rsp) + +// The outer loop from i = 255, ..., i = 0 (inclusive) + + movl $255, %eax + movq %rax, i + +curve25519_pxscalarmul_alt_loop: + +// sm = xm + zm; sn = xn + zn; dm = xm - zm; dn = xn - zn +// The adds don't need any normalization as they're fed to muls +// Just make sure the subs fit in 4 digits. + + sub_4(dm,xm,zm) + add_4(sn,xn,zn) + sub_4(dn,xn,zn) + add_4(sm,xm,zm) + +// ADDING: dmsn = dm * sn; dnsm = sm * dn +// DOUBLING: mux d = xt - zt and s = xt + zt for appropriate choice of (xt,zt) + + mul_5(dmsn,sn,dm) + + movq scalar, %rax + movq i, %rdx + movq %rdx, %rcx + shrq $6, %rdx + movq (%rax,%rdx,8), %rdx + shrq %cl, %rdx + andq $1, %rdx + cmpq swap, %rdx + movq %rdx, swap + + mux_4(d,dm,dn) + mux_4(s,sm,sn) + + mul_5(dnsm,sm,dn) + +// DOUBLING: d = (xt - zt)^2 normalized only to 4 digits + + sqr_4(d,d) + +// ADDING: dpro = (dmsn - dnsm)^2, spro = (dmsn + dnsm)^2 +// DOUBLING: s = (xt + zt)^2, normalized only to 4 digits + + sub5_4(dpro,dmsn,dnsm) + sqr_4(s,s) + add5_4(spro,dmsn,dnsm) + sqr_4(dpro,dpro) + +// DOUBLING: p = 4 * xt * zt = s - d + + sub_twice4(p,s,d) + +// ADDING: xm' = (dmsn + dnsm)^2 + + sqr_p25519(xm,spro) + +// DOUBLING: e = 121666 * p + d + + cmadd_4(e,0x1db42,p,d) + +// DOUBLING: xn' = (xt + zt)^2 * (xt - zt)^2 = s * d + + mul_p25519(xn,s,d) + +// ADDING: zm' = x * (dmsn - dnsm)^2 + + movq point, %rbp + mul_p25519(zm,dpro,x) + +// DOUBLING: zn' = (4 * xt * zt) * ((xt - zt)^2 + 121666 * (4 * xt * zt)) +// = p * (d + 121666 * p) + + mul_p25519(zn,p,e) + +// Loop down as far as 0 (inclusive) + + movq i, %rax + subq $1, %rax + movq %rax, i + jnc curve25519_pxscalarmul_alt_loop + +// The main loop does not handle the special input of the 2-torsion +// point = (0,0). In that case we may get a spurious (0,0) as output +// when we want (0,1) [for odd scalar] or (1,0) [for even scalar]. +// Test if x = 0 (this is equivalent for curve25519 to y = 0) and if +// so, patch zm = 1 [for odd multiple], xn = 1 [for even multiple]. 
+ + movl $1, %ecx + movq point, %rbp + movq (%rbp), %rax + orq 8(%rbp), %rax + orq 16(%rbp), %rax + orq 24(%rbp), %rax + cmovnzq %rcx, %rax + xorq $1, %rax + orq %rax, (%rsp) + orq %rax, 256(%rsp) + +// Multiplex into the final outputs + + movq res, %rbp + movq swap, %rax + testq %rax, %rax + + mux_4(resx,xm,xn) + mux_4(resz,zm,zn) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_x25519.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_x25519.S index b9f7cdaa163..db8d3767374 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_x25519.S @@ -595,6 +595,7 @@ S2N_BN_SYMBOL(curve25519_x25519): S2N_BN_SYMBOL(curve25519_x25519_byte): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_x25519_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_x25519_alt.S index f7c6c3d7b02..88c29f1ec0a 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_x25519_alt.S @@ -756,6 +756,7 @@ S2N_BN_SYMBOL(curve25519_x25519_alt): S2N_BN_SYMBOL(curve25519_x25519_byte_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519base.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_x25519base.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519base.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_x25519base.S index dda3b1707b6..eb7e509aa61 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519base.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_x25519base.S @@ -338,6 +338,7 @@ S2N_BN_SYMBOL(curve25519_x25519base): S2N_BN_SYMBOL(curve25519_x25519base_byte): + _CET_ENDBR // In this case the Windows form literally makes a subroutine call. // This avoids hassle arising from keeping code and data together. 
diff --git a/third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519base_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_x25519base_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519base_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_x25519base_alt.S index b6c82faba0c..34ee779a183 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519base_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_x25519base_alt.S @@ -414,6 +414,7 @@ S2N_BN_SYMBOL(curve25519_x25519base_alt): S2N_BN_SYMBOL(curve25519_x25519base_byte_alt): + _CET_ENDBR // In this case the Windows form literally makes a subroutine call. // This avoids hassle arising from keeping code and data together. diff --git a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_decode.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_decode.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/edwards25519_decode.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_decode.S index ae63e0dacba..25cc51bc3ac 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_decode.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_decode.S @@ -69,6 +69,7 @@ #define Q8 (25*N) S2N_BN_SYMBOL(edwards25519_decode): + _CET_ENDBR // In this case the Windows form literally makes a subroutine call. // This avoids hassle arising from subroutine offsets diff --git a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_decode_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_decode_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/edwards25519_decode_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_decode_alt.S index 8bfe721253a..bd5fae468d0 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_decode_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_decode_alt.S @@ -69,6 +69,7 @@ #define Q8 (25*N) S2N_BN_SYMBOL(edwards25519_decode_alt): + _CET_ENDBR // In this case the Windows form literally makes a subroutine call. // This avoids hassle arising from subroutine offsets diff --git a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_encode.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_encode.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/edwards25519_encode.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_encode.S index 13b0102d098..dc05eb2d45d 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_encode.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_encode.S @@ -41,6 +41,7 @@ #define xb %r9 S2N_BN_SYMBOL(edwards25519_encode): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_epadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_epadd.S new file mode 100644 index 00000000000..02b0504aaad --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_epadd.S @@ -0,0 +1,436 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended projective addition for edwards25519 +// Inputs p1[16], p2[16]; output p3[16] +// +// extern void edwards25519_epadd +// (uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 16]) +// +// The output p3 and both inputs p1 and p2 are points (x,y) on +// edwards25519 represented in extended projective quadruples (X,Y,Z,T) +// where x = X / Z, y = Y / Z and x * y = T / Z. +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_epadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_epadd) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Registers used for inputs and outputs within basic operations. +// Here p1 and p3 are where the parameters come in anyway; +// the p2 = %rbp assignment is set up at the beginning. + +#define p3 %rdi +#define p1 %rsi +#define p2 %rbp + +// Pointers to input and output coordinates + +#define x_1 0(p1) +#define y_1 NUMSIZE(p1) +#define z_1 (2*NUMSIZE)(p1) +#define w_1 (3*NUMSIZE)(p1) + +#define x_2 0(p2) +#define y_2 NUMSIZE(p2) +#define z_2 (2*NUMSIZE)(p2) +#define w_2 (3*NUMSIZE)(p2) + +#define x_3 0(p3) +#define y_3 NUMSIZE(p3) +#define z_3 (2*NUMSIZE)(p3) +#define w_3 (3*NUMSIZE)(p3) + +// Pointer-offset pairs for temporaries on stack + +#define t0 (0*NUMSIZE)(%rsp) +#define t1 (1*NUMSIZE)(%rsp) +#define t2 (2*NUMSIZE)(%rsp) +#define t3 (3*NUMSIZE)(%rsp) +#define t4 (4*NUMSIZE)(%rsp) +#define t5 (5*NUMSIZE)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (6*NUMSIZE) + +// Macro wrapping up the basic field multiplication, only trivially +// different from a pure function call to bignum_mul_p25519. 
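+// (For reference: carrying the extra coordinate T = X*Y/Z is what keeps the
+// unified addition cheap.  The Edwards addition law needs the product
+// d * x1*x2*y1*y2, and since x1*y1 = T1/Z1 and x2*y2 = T2/Z2 that product is
+// just d * T1*T2 over the common projective denominator Z1*Z2, i.e. a single
+// field multiplication by the precomputed constant k_25519 = 2 * d_25519
+// loaded further below.)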
+ +#define mul_p25519(P0,P1,P2) \ + xorl %esi, %esi ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rsi, %r12 ; \ + xorl %esi, %esi ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rsi, %r13 ; \ + adcxq %rsi, %r13 ; \ + xorl %esi, %esi ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rsi, %r14 ; \ + adcxq %rsi, %r14 ; \ + xorl %esi, %esi ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rsi, %r15 ; \ + adcxq %rsi, %r15 ; \ + movl $0x26, %edx ; \ + xorl %esi, %esi ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rsi, %r12 ; \ + adcxq %rsi, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + movl $0x13, %edx ; \ + incq %r12; \ + bts $63, %r11 ; \ + mulxq %r12, %rax, %rbx ; \ + addq %rax, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rsi, %r10 ; \ + adcq %rsi, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rdx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rsi, %r9 ; \ + sbbq %rsi, %r10 ; \ + sbbq %rsi, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. 
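+// For orientation, the macros in this file are combined at the end into the
+// standard unified addition on extended twisted-Edwards coordinates (the
+// Hisil-Wong-Carter-Dawson formulas); with P1 = (X1,Y1,Z1,T1) and
+// P2 = (X2,Y2,Z2,T2) the sequence computes
+//
+//     A = (Y1 - X1) * (Y2 - X2)       B = (Y1 + X1) * (Y2 + X2)
+//     C = (2*d) * T1 * T2             D = 2 * Z1 * Z2
+//     E = B - A     F = D - C     G = D + C     H = B + A
+//     X3 = E * F    Y3 = G * H    T3 = E * H    Z3 = F * G
+//
+// The intermediate products only need the weaker < 2 * p_25519 bound; the
+// four final mul_p25519 calls then produce fully reduced output coordinates.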
+ +#define mul_4(P0,P1,P2) \ + xorl %ecx, %ecx ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + xorl %ecx, %ecx ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rcx, %r13 ; \ + xorl %ecx, %ecx ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcxq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rcx, %r15 ; \ + adcxq %rcx, %r15 ; \ + movl $0x26, %edx ; \ + xorl %ecx, %ecx ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + adcxq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Plain 4-digit add and doubling without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq 8+P2, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq 16+P2, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq 24+P2, %rax ; \ + movq %rax, 24+P0 + +#define double_4(P0,P1) \ + movq P1, %rax ; \ + addq %rax, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq %rax, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq %rax, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq %rax, %rax ; \ + movq %rax, 24+P0 + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. 
It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(P0,P1,P2) \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + subq $19, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq $0, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq $0, %rax ; \ + btc $63, %rax ; \ + movq %rax, 24+P0 + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ebx, %ebx ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movl $38, %ecx ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + cmovncq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %rax ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %rax, 24+P0 + +// Modular addition with inputs double modulus 2 * p_25519 = 2^256 - 38 +// and in general only guaranteeing a 4-digit result, not even < 2 * p_25519. + +#define add_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq P2, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq 0x8+P2, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq 0x10+P2, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq 0x18+P2, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Load the constant k_25519 = 2 * d_25519 using immediate operations + +#define load_k25519(P0) \ + movq $0xebd69b9426b2f159, %rax ; \ + movq %rax, P0 ; \ + movq $0x00e0149a8283b156, %rax ; \ + movq %rax, 8+P0 ; \ + movq $0x198e80f2eef3d130, %rax ; \ + movq %rax, 16+P0 ; \ + movq $0x2406d9dc56dffce7, %rax ; \ + movq %rax, 24+P0 + +S2N_BN_SYMBOL(edwards25519_epadd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers, make room for temps, preserve input arguments. + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main sequence of operations. after setting up p2 in its register + + movq %rdx, p2 + + mul_4(t0,w_1,w_2) + + sub_4(t1,y_1,x_1) + sub_4(t2,y_2,x_2) + add_4(t3,y_1,x_1) + add_4(t4,y_2,x_2) + double_4(t5,z_2) + + mul_4(t1,t1,t2) + mul_4(t3,t3,t4) + + load_k25519(t2) + mul_4(t2,t2,t0) + + mul_4(t4,z_1,t5) + + sub_twice4(t0,t3,t1) + add_twice4(t5,t3,t1) + sub_twice4(t1,t4,t2) + add_twice4(t3,t4,t2) + + mul_p25519(w_3,t0,t5) + mul_p25519(x_3,t0,t1) + mul_p25519(y_3,t3,t5) + mul_p25519(z_3,t1,t3) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_epadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_epadd_alt.S new file mode 100644 index 00000000000..3da55cafb62 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_epadd_alt.S @@ -0,0 +1,512 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended projective addition for edwards25519 +// Inputs p1[16], p2[16]; output p3[16] +// +// extern void edwards25519_epadd_alt +// (uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 16]) +// +// The output p3 and both inputs p1 and p2 are points (x,y) on +// edwards25519 represented in extended projective quadruples (X,Y,Z,T) +// where x = X / Z, y = Y / Z and x * y = T / Z. +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_epadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_epadd_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Registers used for inputs and outputs within basic operations. +// Here p1 and p3 are where the parameters come in anyway; +// the p2 = %rbp assignment is set up at the beginning. + +#define p3 %rdi +#define p1 %rsi +#define p2 %rbp + +// Pointers to input and output coordinates + +#define x_1 0(p1) +#define y_1 NUMSIZE(p1) +#define z_1 (2*NUMSIZE)(p1) +#define w_1 (3*NUMSIZE)(p1) + +#define x_2 0(p2) +#define y_2 NUMSIZE(p2) +#define z_2 (2*NUMSIZE)(p2) +#define w_2 (3*NUMSIZE)(p2) + +#define x_3 0(p3) +#define y_3 NUMSIZE(p3) +#define z_3 (2*NUMSIZE)(p3) +#define w_3 (3*NUMSIZE)(p3) + +// Pointer-offset pairs for temporaries on stack + +#define t0 (0*NUMSIZE)(%rsp) +#define t1 (1*NUMSIZE)(%rsp) +#define t2 (2*NUMSIZE)(%rsp) +#define t3 (3*NUMSIZE)(%rsp) +#define t4 (4*NUMSIZE)(%rsp) +#define t5 (5*NUMSIZE)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (6*NUMSIZE) + +// Macro wrapping up the basic field multiplication, only trivially +// different from a pure function call to bignum_mul_p25519_alt. 
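+// (This _alt variant mirrors edwards25519_epadd.S but builds the field
+// arithmetic from widening mulq and plain adc carry chains only, avoiding
+// the BMI2 mulx and ADX adcx/adox instructions the non-alt version relies
+// on; which of the two gets called is expected to be decided by the caller
+// from the CPU's capabilities.)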
+ +#define mul_p25519(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %esi ; \ + movq %r12, %rax ; \ + mulq %rsi; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movl $0x13, %esi ; \ + bts $63, %r11 ; \ + imulq %rsi, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rsi, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. 
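+// (Why the weaker bound is acceptable here: a value < 2 * p_25519 still fits
+// in four limbs, sub_twice4/add_twice4 below work modulo the double modulus
+// 2 * p_25519 = 2^256 - 38, and every coordinate actually written to the
+// output goes through a final, fully reducing mul_p25519.)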
+ +#define mul_4(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %ebx ; \ + movq %r12, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Plain 4-digit add and doubling without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq 8+P2, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq 16+P2, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq 24+P2, %rax ; \ + movq %rax, 24+P0 + +#define double_4(P0,P1) \ + movq P1, %rax ; \ + addq %rax, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq %rax, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq %rax, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq %rax, %rax ; \ + movq %rax, 24+P0 + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. 
It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(P0,P1,P2) \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + subq $19, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq $0, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq $0, %rax ; \ + btc $63, %rax ; \ + movq %rax, 24+P0 + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ebx, %ebx ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movl $38, %ecx ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + cmovncq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %rax ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %rax, 24+P0 + +// Modular addition with inputs double modulus 2 * p_25519 = 2^256 - 38 +// and in general only guaranteeing a 4-digit result, not even < 2 * p_25519. + +#define add_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq P2, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq 0x8+P2, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq 0x10+P2, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq 0x18+P2, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Load the constant k_25519 = 2 * d_25519 using immediate operations + +#define load_k25519(P0) \ + movq $0xebd69b9426b2f159, %rax ; \ + movq %rax, P0 ; \ + movq $0x00e0149a8283b156, %rax ; \ + movq %rax, 8+P0 ; \ + movq $0x198e80f2eef3d130, %rax ; \ + movq %rax, 16+P0 ; \ + movq $0x2406d9dc56dffce7, %rax ; \ + movq %rax, 24+P0 + +S2N_BN_SYMBOL(edwards25519_epadd_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers, make room for temps, preserve input arguments. + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main sequence of operations. after setting up p2 in its register + + movq %rdx, p2 + + mul_4(t0,w_1,w_2) + + sub_4(t1,y_1,x_1) + sub_4(t2,y_2,x_2) + add_4(t3,y_1,x_1) + add_4(t4,y_2,x_2) + double_4(t5,z_2) + + mul_4(t1,t1,t2) + mul_4(t3,t3,t4) + + load_k25519(t2) + mul_4(t2,t2,t0) + + mul_4(t4,z_1,t5) + + sub_twice4(t0,t3,t1) + add_twice4(t5,t3,t1) + sub_twice4(t1,t4,t2) + add_twice4(t3,t4,t2) + + mul_p25519(w_3,t0,t5) + mul_p25519(x_3,t0,t1) + mul_p25519(y_3,t3,t5) + mul_p25519(z_3,t1,t3) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_epdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_epdouble.S new file mode 100644 index 00000000000..4472d99a19f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_epdouble.S @@ -0,0 +1,375 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended projective doubling for edwards25519 +// Input p1[12]; output p3[16] +// +// extern void edwards25519_epdouble +// (uint64_t p3[static 16],uint64_t p1[static 12]) +// +// If p1 is a point on edwards25519, returns its double p3 = 2 * p1. +// The output p3 is in extended projective coordinates, representing +// affine (x,y) by a quadruple (X,Y,Z,T) where x = X / Z, y = Y / Z +// and x * y = T / Z. The input p1 may also be in the same extended +// projective representation, but the final T field is not used so +// a more basic projective triple (X,Y,Z) suffices. +// +// Standard x86-64 ABI: RDI = p3, RSI = p1 +// Microsoft x64 ABI: RCX = p3, RDX = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_epdouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_epdouble) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Registers used for inputs and outputs within basic operations. +// Here p1 and p3 are where the parameters come in anyway. + +#define p3 %rdi +#define p1 %rsi + +// Pointers to input and output coordinates + +#define x_1 0(p1) +#define y_1 NUMSIZE(p1) +#define z_1 (2*NUMSIZE)(p1) + +#define x_3 0(p3) +#define y_3 NUMSIZE(p3) +#define z_3 (2*NUMSIZE)(p3) +#define w_3 (3*NUMSIZE)(p3) + +// Pointer-offset pairs for temporaries on stack + +#define t0 (0*NUMSIZE)(%rsp) +#define t1 (1*NUMSIZE)(%rsp) +#define t2 (2*NUMSIZE)(%rsp) +#define t3 (3*NUMSIZE)(%rsp) +#define t4 (4*NUMSIZE)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (5*NUMSIZE) + +// Macro wrapping up the basic field multiplication, only trivially +// different from a pure function call to bignum_mul_p25519. 
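For reference, the reduction inside the mul_p25519 macro below can be read against the two identities that follow from $p_{25519} = 2^{255} - 19$ (a sketch traced from the constants 0x26 = 38 and 0x13 = 19 used in the code, not a comment taken from the source):
\[
2^{256} \equiv 38 \pmod{p_{25519}}, \qquad 2^{255} \equiv 19 \pmod{p_{25519}}.
\]
A 512-bit product written as $2^{256} h + \ell$ is first folded to $\ell + 38h$, and whatever part of that sum lies above bit 255 is folded once more via a multiplication by 19, followed by the final conditional correction that brings the result into $[0, p_{25519})$.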
+ +#define mul_p25519(P0,P1,P2) \ + xorl %esi, %esi ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rsi, %r12 ; \ + xorl %esi, %esi ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rsi, %r13 ; \ + adcxq %rsi, %r13 ; \ + xorl %esi, %esi ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rsi, %r14 ; \ + adcxq %rsi, %r14 ; \ + xorl %esi, %esi ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rsi, %r15 ; \ + adcxq %rsi, %r15 ; \ + movl $0x26, %edx ; \ + xorl %esi, %esi ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rsi, %r12 ; \ + adcxq %rsi, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + movl $0x13, %edx ; \ + incq %r12; \ + bts $63, %r11 ; \ + mulxq %r12, %rax, %rbx ; \ + addq %rax, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rsi, %r10 ; \ + adcq %rsi, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rdx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rsi, %r9 ; \ + sbbq %rsi, %r10 ; \ + sbbq %rsi, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. 
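As a rough justification of that claim (a bound sketched here, not stated in the source): after the 38-fold the intermediate value satisfies $t < 39 \cdot 2^{256}$, so the quotient $q = \lfloor t / 2^{255} \rfloor$ is at most 77 and
\[
(t \bmod 2^{255}) + 19\,q \;<\; 2^{255} + 19 \cdot 77 \;<\; 2^{256} - 38 \;=\; 2\,p_{25519},
\]
which is why omitting the +1 and the final correction still leaves a 4-digit value below the double modulus.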
+ +#define sqr_4(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ebx, %ebx ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rbx, %r13 ; \ + adoxq %rbx, %r14 ; \ + adcq %rbx, %r14 ; \ + xorl %ebx, %ebx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rbx, %r15 ; \ + adoxq %rbx, %r15 ; \ + movl $0x26, %edx ; \ + xorl %ebx, %ebx ; \ + mulxq %r12, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq %r13, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq %r14, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + adcxq %rbx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Plain 4-digit add without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq 8+P2, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq 16+P2, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq 24+P2, %rax ; \ + movq %rax, 24+P0 + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ebx, %ebx ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movl $38, %ecx ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + cmovncq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %rax ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %rax, 24+P0 + +// Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. +// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. 
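Concretely, the double modulus is $2\,p_{25519} = 2^{256} - 38$, so arithmetic modulo it only ever needs a single adjustment by 38 (spelled out as a reading aid, not a statement from the source):
\[
2^{256} \equiv 38 \pmod{2\,p_{25519}}.
\]
Hence add_twice4 adds 38 back whenever the raw 4-digit sum carries out of bit 256, and sub_twice4 subtracts 38 whenever the raw difference borrows; the cmovncq instructions select between 38 and 0 accordingly.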
+ +#define add_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq P2, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq 0x8+P2, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq 0x10+P2, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq 0x18+P2, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +#define double_twice4(P0,P1) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq %r8, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq %r9, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq %r10, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq %r11, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +S2N_BN_SYMBOL(edwards25519_epdouble): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers, make room for temps, preserve input arguments. + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main sequence + + add_4(t0,x_1,y_1) + sqr_4(t1,z_1) + sqr_4(t2,x_1) + sqr_4(t3,y_1) + double_twice4(t1,t1) + sqr_4(t0,t0) + add_twice4(t4,t2,t3) + sub_twice4(t2,t2,t3) + add_twice4(t3,t1,t2) + sub_twice4(t1,t4,t0) + mul_p25519(y_3,t2,t4) + mul_p25519(z_3,t3,t2) + mul_p25519(w_3,t1,t4) + mul_p25519(x_3,t1,t3) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_epdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_epdouble_alt.S new file mode 100644 index 00000000000..a335149e0e6 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_epdouble_alt.S @@ -0,0 +1,454 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended projective doubling for edwards25519 +// Input p1[12]; output p3[16] +// +// extern void edwards25519_epdouble +// (uint64_t p3[static 16],uint64_t p1[static 12]) +// +// If p1 is a point on edwards25519, returns its double p3 = 2 * p1. +// The output p3 is in extended projective coordinates, representing +// affine (x,y) by a quadruple (X,Y,Z,T) where x = X / Z, y = Y / Z +// and x * y = T / Z. The input p1 may also be in the same extended +// projective representation, but the final T field is not used so +// a more basic projective triple (X,Y,Z) suffices. +// +// Standard x86-64 ABI: RDI = p3, RSI = p1 +// Microsoft x64 ABI: RCX = p3, RDX = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_epdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_epdouble_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Registers used for inputs and outputs within basic operations. +// Here p1 and p3 are where the parameters come in anyway. 
+ +#define p3 %rdi +#define p1 %rsi + +// Pointers to input and output coordinates + +#define x_1 0(p1) +#define y_1 NUMSIZE(p1) +#define z_1 (2*NUMSIZE)(p1) + +#define x_3 0(p3) +#define y_3 NUMSIZE(p3) +#define z_3 (2*NUMSIZE)(p3) +#define w_3 (3*NUMSIZE)(p3) + +// Pointer-offset pairs for temporaries on stack + +#define t0 (0*NUMSIZE)(%rsp) +#define t1 (1*NUMSIZE)(%rsp) +#define t2 (2*NUMSIZE)(%rsp) +#define t3 (3*NUMSIZE)(%rsp) +#define t4 (4*NUMSIZE)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (5*NUMSIZE) + +// Macro wrapping up the basic field multiplication, only trivially +// different from a pure function call to bignum_mul_p25519_alt. + +#define mul_p25519(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %esi ; \ + movq %r12, %rax ; \ + mulq %rsi; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movl $0x13, %esi ; \ + bts $63, %r11 ; \ + imulq %rsi, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rsi, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the 
quotient estimate and the final +// optional correction. + +#define sqr_4(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r14 ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r15 ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %ebx ; \ + movq %r12, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Plain 4-digit add without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq 8+P2, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq 16+P2, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq 24+P2, %rax ; \ + movq %rax, 24+P0 + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ebx, %ebx ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movl $38, %ecx ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + cmovncq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %rax ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %rax, 24+P0 + +// Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. 
+// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. + +#define add_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq P2, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq 0x8+P2, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq 0x10+P2, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq 0x18+P2, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +#define double_twice4(P0,P1) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq %r8, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq %r9, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq %r10, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq %r11, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +S2N_BN_SYMBOL(edwards25519_epdouble_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers, make room for temps, preserve input arguments. + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main sequence + + add_4(t0,x_1,y_1) + sqr_4(t1,z_1) + sqr_4(t2,x_1) + sqr_4(t3,y_1) + double_twice4(t1,t1) + sqr_4(t0,t0) + add_twice4(t4,t2,t3) + sub_twice4(t2,t2,t3) + add_twice4(t3,t1,t2) + sub_twice4(t1,t4,t0) + mul_p25519(y_3,t2,t4) + mul_p25519(z_3,t3,t2) + mul_p25519(w_3,t1,t4) + mul_p25519(x_3,t1,t3) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_pdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_pdouble.S new file mode 100644 index 00000000000..093d790289b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_pdouble.S @@ -0,0 +1,370 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Projective doubling for edwards25519 +// Input p1[12]; output p3[12] +// +// extern void edwards25519_pdouble +// (uint64_t p3[static 12],uint64_t p1[static 12]) +// +// If p1 is a point on edwards25519, returns its double p3 = 2 * p1. +// Input and output are in pure projective coordinates, representing +// an affine (x,y) by a triple (X,Y,Z) where x = X / Z, y = Y / Z. +// +// Standard x86-64 ABI: RDI = p3, RSI = p1 +// Microsoft x64 ABI: RCX = p3, RDX = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_pdouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_pdouble) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Registers used for inputs and outputs within basic operations. +// Here p1 and p3 are where the parameters come in anyway. 
+ +#define p3 %rdi +#define p1 %rsi + +// Pointers to input and output coordinates + +#define x_1 0(p1) +#define y_1 NUMSIZE(p1) +#define z_1 (2*NUMSIZE)(p1) + +#define x_3 0(p3) +#define y_3 NUMSIZE(p3) +#define z_3 (2*NUMSIZE)(p3) + +// Pointer-offset pairs for temporaries on stack + +#define t0 (0*NUMSIZE)(%rsp) +#define t1 (1*NUMSIZE)(%rsp) +#define t2 (2*NUMSIZE)(%rsp) +#define t3 (3*NUMSIZE)(%rsp) +#define t4 (4*NUMSIZE)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (5*NUMSIZE) + +// Macro wrapping up the basic field multiplication, only trivially +// different from a pure function call to bignum_mul_p25519. + +#define mul_p25519(P0,P1,P2) \ + xorl %esi, %esi ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rsi, %r12 ; \ + xorl %esi, %esi ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rsi, %r13 ; \ + adcxq %rsi, %r13 ; \ + xorl %esi, %esi ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rsi, %r14 ; \ + adcxq %rsi, %r14 ; \ + xorl %esi, %esi ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rsi, %r15 ; \ + adcxq %rsi, %r15 ; \ + movl $0x26, %edx ; \ + xorl %esi, %esi ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rsi, %r12 ; \ + adcxq %rsi, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + movl $0x13, %edx ; \ + incq %r12; \ + bts $63, %r11 ; \ + mulxq %r12, %rax, %rbx ; \ + addq %rax, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rsi, %r10 ; \ + adcq %rsi, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rdx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rsi, %r9 ; \ + sbbq %rsi, %r10 ; \ + sbbq %rsi, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. 
+ +#define sqr_4(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ebx, %ebx ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rbx, %r13 ; \ + adoxq %rbx, %r14 ; \ + adcq %rbx, %r14 ; \ + xorl %ebx, %ebx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rbx, %r15 ; \ + adoxq %rbx, %r15 ; \ + movl $0x26, %edx ; \ + xorl %ebx, %ebx ; \ + mulxq %r12, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq %r13, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq %r14, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + adcxq %rbx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Plain 4-digit add without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq 8+P2, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq 16+P2, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq 24+P2, %rax ; \ + movq %rax, 24+P0 + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ebx, %ebx ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movl $38, %ecx ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + cmovncq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %rax ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %rax, 24+P0 + +// Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. +// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. 
+ +#define add_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq P2, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq 0x8+P2, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq 0x10+P2, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq 0x18+P2, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +#define double_twice4(P0,P1) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq %r8, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq %r9, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq %r10, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq %r11, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +S2N_BN_SYMBOL(edwards25519_pdouble): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers, make room for temps, preserve input arguments. + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main sequence + + add_4(t0,x_1,y_1) + sqr_4(t1,z_1) + sqr_4(t2,x_1) + sqr_4(t3,y_1) + double_twice4(t1,t1) + sqr_4(t0,t0) + add_twice4(t4,t2,t3) + sub_twice4(t2,t2,t3) + add_twice4(t3,t1,t2) + sub_twice4(t1,t4,t0) + mul_p25519(y_3,t2,t4) + mul_p25519(z_3,t3,t2) + mul_p25519(x_3,t1,t3) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_pdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_pdouble_alt.S new file mode 100644 index 00000000000..4d122cd6284 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_pdouble_alt.S @@ -0,0 +1,449 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Projective doubling for edwards25519 +// Input p1[12]; output p3[12] +// +// extern void edwards25519_pdouble +// (uint64_t p3[static 12],uint64_t p1[static 12]) +// +// If p1 is a point on edwards25519, returns its double p3 = 2 * p1. +// Input and output are in pure projective coordinates, representing +// an affine (x,y) by a triple (X,Y,Z) where x = X / Z, y = Y / Z. +// +// Standard x86-64 ABI: RDI = p3, RSI = p1 +// Microsoft x64 ABI: RCX = p3, RDX = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_pdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_pdouble_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Registers used for inputs and outputs within basic operations. +// Here p1 and p3 are where the parameters come in anyway. 
+ +#define p3 %rdi +#define p1 %rsi + +// Pointers to input and output coordinates + +#define x_1 0(p1) +#define y_1 NUMSIZE(p1) +#define z_1 (2*NUMSIZE)(p1) + +#define x_3 0(p3) +#define y_3 NUMSIZE(p3) +#define z_3 (2*NUMSIZE)(p3) + +// Pointer-offset pairs for temporaries on stack + +#define t0 (0*NUMSIZE)(%rsp) +#define t1 (1*NUMSIZE)(%rsp) +#define t2 (2*NUMSIZE)(%rsp) +#define t3 (3*NUMSIZE)(%rsp) +#define t4 (4*NUMSIZE)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (5*NUMSIZE) + +// Macro wrapping up the basic field multiplication, only trivially +// different from a pure function call to bignum_mul_p25519_alt. + +#define mul_p25519(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %esi ; \ + movq %r12, %rax ; \ + mulq %rsi; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movl $0x13, %esi ; \ + bts $63, %r11 ; \ + imulq %rsi, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rsi, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final 
+// optional correction. + +#define sqr_4(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r14 ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r15 ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %ebx ; \ + movq %r12, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Plain 4-digit add without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq 8+P2, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq 16+P2, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq 24+P2, %rax ; \ + movq %rax, 24+P0 + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ebx, %ebx ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movl $38, %ecx ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + cmovncq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %rax ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %rax, 24+P0 + +// Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. +// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. 
The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. + +#define add_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq P2, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq 0x8+P2, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq 0x10+P2, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq 0x18+P2, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +#define double_twice4(P0,P1) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq %r8, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq %r9, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq %r10, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq %r11, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +S2N_BN_SYMBOL(edwards25519_pdouble_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers, make room for temps, preserve input arguments. + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main sequence + + add_4(t0,x_1,y_1) + sqr_4(t1,z_1) + sqr_4(t2,x_1) + sqr_4(t3,y_1) + double_twice4(t1,t1) + sqr_4(t0,t0) + add_twice4(t4,t2,t3) + sub_twice4(t2,t2,t3) + add_twice4(t3,t1,t2) + sub_twice4(t1,t4,t0) + mul_p25519(y_3,t2,t4) + mul_p25519(z_3,t3,t2) + mul_p25519(x_3,t1,t3) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_pepadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_pepadd.S new file mode 100644 index 00000000000..1d68e2add39 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_pepadd.S @@ -0,0 +1,419 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended projective + precomputed mixed addition for edwards25519 +// Inputs p1[16], p2[12]; output p3[16] +// +// extern void edwards25519_pepadd +// (uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 12]) +// +// The output p3 and the first input p1 are points (x,y) on edwards25519 +// represented in extended projective quadruples (X,Y,Z,T) where +// x = X / Z, y = Y / Z and x * y = T / Z. The second input p2 is a triple +// encoding its point (x,y) as (y - x,y + x,2 * d * x * y) where d is the +// usual Edwards curve parameter for edwards25519. 
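Spelling out the algebra behind that representation (a sketch traced from the main sequence near the end of this file, not a comment from the source): writing the precomputed second point as $(y_2 - x_2,\; y_2 + x_2,\; 2d\,x_2 y_2)$ and setting $A = (Y_1 - X_1)(y_2 - x_2)$, $B = (Y_1 + X_1)(y_2 + x_2)$, $C = T_1 \cdot 2d\,x_2 y_2$ and $D = 2 Z_1$, the code forms
\[
E = B - A, \qquad F = D - C, \qquad G = D + C, \qquad H = B + A,
\]
\[
X_3 = E\,F, \qquad Y_3 = G\,H, \qquad Z_3 = F\,G, \qquad T_3 = E\,H,
\]
the standard extended-coordinate mixed-addition formulas of Hisil, Wong, Carter and Dawson for the $a = -1$ twisted Edwards form, with the second point taken to have $Z_2 = 1$.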
+// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_pepadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_pepadd) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Registers used for inputs and outputs within basic operations. +// Here p1 and p3 are where the parameters come in anyway; +// the p2 = %rbp assignment is set up at the beginning. + +#define p3 %rdi +#define p1 %rsi +#define p2 %rbp + +// Pointers to input and output coordinates + +#define x_1 0(p1) +#define y_1 NUMSIZE(p1) +#define z_1 (2*NUMSIZE)(p1) +#define w_1 (3*NUMSIZE)(p1) + +#define ymx_2 0(p2) +#define xpy_2 NUMSIZE(p2) +#define kxy_2 (2*NUMSIZE)(p2) + +#define x_3 0(p3) +#define y_3 NUMSIZE(p3) +#define z_3 (2*NUMSIZE)(p3) +#define w_3 (3*NUMSIZE)(p3) + +// Pointer-offset pairs for temporaries on stack + +#define t0 (0*NUMSIZE)(%rsp) +#define t1 (1*NUMSIZE)(%rsp) +#define t2 (2*NUMSIZE)(%rsp) +#define t3 (3*NUMSIZE)(%rsp) +#define t4 (4*NUMSIZE)(%rsp) +#define t5 (5*NUMSIZE)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (6*NUMSIZE) + +// Macro wrapping up the basic field multiplication, only trivially +// different from a pure function call to bignum_mul_p25519. + +#define mul_p25519(P0,P1,P2) \ + xorl %esi, %esi ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rsi, %r12 ; \ + xorl %esi, %esi ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rsi, %r13 ; \ + adcxq %rsi, %r13 ; \ + xorl %esi, %esi ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rsi, %r14 ; \ + adcxq %rsi, %r14 ; \ + xorl %esi, %esi ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rsi, %r15 ; \ + adcxq %rsi, %r15 ; \ + movl $0x26, %edx ; \ + xorl %esi, %esi ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rsi, %r12 ; \ + adcxq %rsi, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + movl $0x13, %edx ; \ + incq %r12; \ + bts $63, %r11 ; \ + mulxq %r12, %rax, %rbx ; \ + addq %rax, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rsi, %r10 ; \ + adcq %rsi, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rdx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rsi, %r9 ; \ + sbbq %rsi, %r10 ; \ + sbbq %rsi, %r11 ; \ + btr $63, 
%r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. + +#define mul_4(P0,P1,P2) \ + xorl %ecx, %ecx ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + xorl %ecx, %ecx ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rcx, %r13 ; \ + xorl %ecx, %ecx ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcxq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rcx, %r15 ; \ + adcxq %rcx, %r15 ; \ + movl $0x26, %edx ; \ + xorl %ecx, %ecx ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + adcxq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Plain 4-digit add and doubling without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq 8+P2, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq 16+P2, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq 24+P2, %rax ; \ + movq %rax, 24+P0 + +#define double_4(P0,P1) \ + movq P1, %rax ; \ + addq %rax, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq %rax, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq %rax, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq %rax, %rax ; \ + movq %rax, 24+P0 + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. 
It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(P0,P1,P2) \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + subq $19, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq $0, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq $0, %rax ; \ + btc $63, %rax ; \ + movq %rax, 24+P0 + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ebx, %ebx ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movl $38, %ecx ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + cmovncq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %rax ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %rax, 24+P0 + +// Modular addition with inputs double modulus 2 * p_25519 = 2^256 - 38 +// and in general only guaranteeing a 4-digit result, not even < 2 * p_25519. + +#define add_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq P2, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq 0x8+P2, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq 0x10+P2, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq 0x18+P2, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +S2N_BN_SYMBOL(edwards25519_pepadd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers, make room for temps, preserve input arguments. + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main sequence of operations. after setting up p2 in its register + + movq %rdx, p2 + + double_4(t0,z_1); + + sub_4(t1,y_1,x_1); + add_4(t2,y_1,x_1); + + mul_4(t3,w_1,kxy_2); + + mul_4(t1,t1,ymx_2); + mul_4(t2,t2,xpy_2); + + sub_twice4(t4,t0,t3); + add_twice4(t0,t0,t3); + sub_twice4(t5,t2,t1); + add_twice4(t1,t2,t1); + + mul_p25519(z_3,t4,t0); + mul_p25519(x_3,t5,t4); + mul_p25519(y_3,t0,t1); + mul_p25519(w_3,t5,t1); + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_pepadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_pepadd_alt.S new file mode 100644 index 00000000000..46faa373be1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_pepadd_alt.S @@ -0,0 +1,495 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended projective + precomputed mixed addition for edwards25519 +// Inputs p1[16], p2[12]; output p3[16] +// +// extern void edwards25519_pepadd_alt +// (uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 12]) +// +// The output p3 and the first input p1 are points (x,y) on edwards25519 +// represented in extended projective quadruples (X,Y,Z,T) where +// x = X / Z, y = Y / Z and x * y = T / Z. The second input p2 is a triple +// encoding its point (x,y) as (y - x,y + x,2 * d * x * y) where d is the +// usual Edwards curve parameter for edwards25519. +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_pepadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_pepadd_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Registers used for inputs and outputs within basic operations. +// Here p1 and p3 are where the parameters come in anyway; +// the p2 = %rbp assignment is set up at the beginning. + +#define p3 %rdi +#define p1 %rsi +#define p2 %rbp + +// Pointers to input and output coordinates + +#define x_1 0(p1) +#define y_1 NUMSIZE(p1) +#define z_1 (2*NUMSIZE)(p1) +#define w_1 (3*NUMSIZE)(p1) + +#define ymx_2 0(p2) +#define xpy_2 NUMSIZE(p2) +#define kxy_2 (2*NUMSIZE)(p2) + +#define x_3 0(p3) +#define y_3 NUMSIZE(p3) +#define z_3 (2*NUMSIZE)(p3) +#define w_3 (3*NUMSIZE)(p3) + +// Pointer-offset pairs for temporaries on stack + +#define t0 (0*NUMSIZE)(%rsp) +#define t1 (1*NUMSIZE)(%rsp) +#define t2 (2*NUMSIZE)(%rsp) +#define t3 (3*NUMSIZE)(%rsp) +#define t4 (4*NUMSIZE)(%rsp) +#define t5 (5*NUMSIZE)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (6*NUMSIZE) + +// Macro wrapping up the basic field multiplication, only trivially +// different from a pure function call to bignum_mul_p25519_alt. 
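The tail of this macro (the leaq 0x1(%r12), bts, conditional subtraction of 19 and btr) is the usual curve25519 quotient-estimate trick; sketching it from the code rather than from any source comment: write the partially folded value as $t = 2^{255} h + r$ with $r < 2^{255}$ and take the estimate $q = h + 1$. Then
\[
r + 19(h+1) + 2^{255} \;=\; t - q\,p_{25519} + 2^{256},
\]
so adding $19(h+1)$ into $r$ with bit 255 forced on either carries out of bit 256, in which case the low 256 bits are already $t - q\,p_{25519}$ and (given the size bounds on $h$) fully reduced, or it does not, in which case $q$ was one too large and subtracting 19 and clearing bit 255 repairs it, since $2^{255} + 19 = 2^{256} - p_{25519}$.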
+ +#define mul_p25519(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %esi ; \ + movq %r12, %rax ; \ + mulq %rsi; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movl $0x13, %esi ; \ + bts $63, %r11 ; \ + imulq %rsi, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rsi, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. 
+ +#define mul_4(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %ebx ; \ + movq %r12, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Plain 4-digit add and doubling without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq 8+P2, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq 16+P2, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq 24+P2, %rax ; \ + movq %rax, 24+P0 + +#define double_4(P0,P1) \ + movq P1, %rax ; \ + addq %rax, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq %rax, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq %rax, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq %rax, %rax ; \ + movq %rax, 24+P0 + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. 
It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(P0,P1,P2) \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + subq $19, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq $0, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq $0, %rax ; \ + btc $63, %rax ; \ + movq %rax, 24+P0 + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ebx, %ebx ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movl $38, %ecx ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + cmovncq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %rax ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %rax, 24+P0 + +// Modular addition with inputs double modulus 2 * p_25519 = 2^256 - 38 +// and in general only guaranteeing a 4-digit result, not even < 2 * p_25519. + +#define add_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq P2, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq 0x8+P2, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq 0x10+P2, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq 0x18+P2, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +S2N_BN_SYMBOL(edwards25519_pepadd_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers, make room for temps, preserve input arguments. + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main sequence of operations. after setting up p2 in its register + + movq %rdx, p2 + + double_4(t0,z_1); + + sub_4(t1,y_1,x_1); + add_4(t2,y_1,x_1); + + mul_4(t3,w_1,kxy_2); + + mul_4(t1,t1,ymx_2); + mul_4(t2,t2,xpy_2); + + sub_twice4(t4,t0,t3); + add_twice4(t0,t0,t3); + sub_twice4(t5,t2,t1); + add_twice4(t1,t2,t1); + + mul_p25519(z_3,t4,t0); + mul_p25519(x_3,t5,t4); + mul_p25519(y_3,t0,t1); + mul_p25519(w_3,t5,t1); + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_scalarmulbase.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_scalarmulbase.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/edwards25519_scalarmulbase.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_scalarmulbase.S index 6b2a80c7282..a2c8c72a617 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_scalarmulbase.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_scalarmulbase.S @@ -336,6 +336,7 @@ movq %r11, 0x18+P0 S2N_BN_SYMBOL(edwards25519_scalarmulbase): + _CET_ENDBR // In this case the Windows form literally makes a subroutine call. // This avoids hassle arising from keeping code and data together. 
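Read as field operations, the main sequence above is the usual unified mixed addition in extended coordinates. Assuming, as the operand names ymx_2, xpy_2 and kxy_2 suggest, that the second point is precomputed as (Y2-X2, Y2+X2, 2*d*X2*Y2) and that (x_1, y_1, z_1, w_1) holds (X1, Y1, Z1, T1), the temporaries line up with the textbook variables as follows (a worked restatement for orientation, not taken from the source):

    A = (Y1 - X1) * (Y2 - X2)      // t1 = sub_4(y_1,x_1); mul_4(t1,t1,ymx_2)
    B = (Y1 + X1) * (Y2 + X2)      // t2 = add_4(y_1,x_1); mul_4(t2,t2,xpy_2)
    C = T1 * (2*d*X2*Y2)           // t3 = mul_4(w_1,kxy_2)
    D = 2 * Z1                     // t0 = double_4(z_1)
    E = B - A                      // t5 = sub_twice4(t2,t1)
    F = D - C                      // t4 = sub_twice4(t0,t3)
    G = D + C                      // t0 = add_twice4(t0,t3)
    H = B + A                      // t1 = add_twice4(t2,t1)
    X3 = E * F                     // mul_p25519(x_3,t5,t4)
    Y3 = G * H                     // mul_p25519(y_3,t0,t1)
    Z3 = F * G                     // mul_p25519(z_3,t4,t0)
    T3 = E * H                     // mul_p25519(w_3,t5,t1)

Only the four final products use the fully reducing mul_p25519; the intermediate sums and differences are merely kept to four digits (below 2^256, or below 2 * p_25519 where noted), which is exactly what the _4 and _twice4 macros guarantee.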
diff --git a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_scalarmulbase_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_scalarmulbase_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/edwards25519_scalarmulbase_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_scalarmulbase_alt.S index 4796e721891..8ae76964779 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_scalarmulbase_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_scalarmulbase_alt.S @@ -412,6 +412,7 @@ movq %r11, 0x18+P0 S2N_BN_SYMBOL(edwards25519_scalarmulbase_alt): + _CET_ENDBR // In this case the Windows form literally makes a subroutine call. // This avoids hassle arising from keeping code and data together. diff --git a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_scalarmuldouble.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_scalarmuldouble.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/edwards25519_scalarmuldouble.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_scalarmuldouble.S index 993c420e056..6de8e992274 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_scalarmuldouble.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_scalarmuldouble.S @@ -434,6 +434,7 @@ movq %rax, 24+P0 S2N_BN_SYMBOL(edwards25519_scalarmuldouble): + _CET_ENDBR // In this case the Windows form literally makes a subroutine call. // This avoids hassle arising from keeping code and data together. diff --git a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S index e7c8f7a59dd..23c0ef8aa10 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S @@ -551,6 +551,7 @@ movq %rax, 24+P0 S2N_BN_SYMBOL(edwards25519_scalarmuldouble_alt): + _CET_ENDBR // In this case the Windows form literally makes a subroutine call. // This avoids hassle arising from keeping code and data together. diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_emontredc_8n.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_emontredc_8n.S new file mode 100644 index 00000000000..adcc9b172d1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_emontredc_8n.S @@ -0,0 +1,422 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended Montgomery reduce in 8-digit blocks, results in input-output buffer +// Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] +// +// extern uint64_t bignum_emontredc_8n +// (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w); +// +// Functionally equivalent to bignum_emontredc (see that file for more detail). +// But in general assumes that the input k is a multiple of 8. 
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = m, RCX = w, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = m, R9 = w, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_emontredc_8n) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_emontredc_8n) + .text + +// Original input parameters are here + +#define z %rsi +#define w %rcx + +// This is copied in early once we stash away k + +#define m %rdi + +// A variable z pointer + +#define zz %rbp + +// Stack-based variables + +#define carry (%rsp) +#define innercount 8(%rsp) +#define outercount 16(%rsp) +#define k8m1 24(%rsp) + +// ----------------------------------------------------------------------------- +// Standard macros as used in pure multiplier arrays +// ----------------------------------------------------------------------------- + +// mulpadd i, j adds z[i] * rdx (now assumed = m[j]) into the window at i+j + +.macro mulpadd arg1,arg2 + mulxq 8*\arg1(z), %rax, %rbx +.if ((\arg1 + \arg2) % 8 == 0) + adcxq %rax, %r8 + adoxq %rbx, %r9 +.elseif ((\arg1 + \arg2) % 8 == 1) + adcxq %rax, %r9 + adoxq %rbx, %r10 +.elseif ((\arg1 + \arg2) % 8 == 2) + adcxq %rax, %r10 + adoxq %rbx, %r11 +.elseif ((\arg1 + \arg2) % 8 == 3) + adcxq %rax, %r11 + adoxq %rbx, %r12 +.elseif ((\arg1 + \arg2) % 8 == 4) + adcxq %rax, %r12 + adoxq %rbx, %r13 +.elseif ((\arg1 + \arg2) % 8 == 5) + adcxq %rax, %r13 + adoxq %rbx, %r14 +.elseif ((\arg1 + \arg2) % 8 == 6) + adcxq %rax, %r14 + adoxq %rbx, %r15 +.elseif ((\arg1 + \arg2) % 8 == 7) + adcxq %rax, %r15 + adoxq %rbx, %r8 +.endif + +.endm + +// addrow i adds z[i] + zz[0..7] * m[j] into the window + +.macro addrow arg1 + movq 8*\arg1(m), %rdx + xorl %eax, %eax // Get a known flag state + +.if (\arg1 % 8 == 0) + adoxq 8*\arg1(zz), %r8 +.elseif (\arg1 % 8 == 1) + adoxq 8*\arg1(zz), %r9 +.elseif (\arg1 % 8 == 2) + adoxq 8*\arg1(zz), %r10 +.elseif (\arg1 % 8 == 3) + adoxq 8*\arg1(zz), %r11 +.elseif (\arg1 % 8 == 4) + adoxq 8*\arg1(zz), %r12 +.elseif (\arg1 % 8 == 5) + adoxq 8*\arg1(zz), %r13 +.elseif (\arg1 % 8 == 6) + adoxq 8*\arg1(zz), %r14 +.elseif (\arg1 % 8 == 7) + adoxq 8*\arg1(zz), %r15 +.endif + + mulpadd 0, \arg1 + +.if (\arg1 % 8 == 0) + movq %r8, 8*\arg1(zz) + movl $0, %r8d +.elseif (\arg1 % 8 == 1) + movq %r9, 8*\arg1(zz) + movl $0, %r9d +.elseif (\arg1 % 8 == 2) + movq %r10, 8*\arg1(zz) + movl $0, %r10d +.elseif (\arg1 % 8 == 3) + movq %r11, 8*\arg1(zz) + movl $0, %r11d +.elseif (\arg1 % 8 == 4) + movq %r12, 8*\arg1(zz) + movl $0, %r12d +.elseif (\arg1 % 8 == 5) + movq %r13, 8*\arg1(zz) + movl $0, %r13d +.elseif (\arg1 % 8 == 6) + movq %r14, 8*\arg1(zz) + movl $0, %r14d +.elseif (\arg1 % 8 == 7) + movq %r15, 8*\arg1(zz) + movl $0, %r15d +.endif + + mulpadd 1, \arg1 + mulpadd 2, \arg1 + mulpadd 3, \arg1 + mulpadd 4, \arg1 + mulpadd 5, \arg1 + mulpadd 6, \arg1 + mulpadd 7, \arg1 + +.if (\arg1 % 8 == 0) + adcq $0, %r8 +.elseif (\arg1 % 8 == 1) + adcq $0, %r9 +.elseif (\arg1 % 8 == 2) + adcq $0, %r10 +.elseif (\arg1 % 8 == 3) + adcq $0, %r11 +.elseif (\arg1 % 8 == 4) + adcq $0, %r12 +.elseif (\arg1 % 8 == 5) + adcq $0, %r13 +.elseif (\arg1 % 8 == 6) + adcq $0, %r14 +.elseif (\arg1 % 8 == 7) + adcq $0, %r15 +.endif + + +.endm + +// ----------------------------------------------------------------------------- +// Anti-matter versions with z and m switched, and also not writing back the z +// words, but the inverses instead, *and* also adding in the z[0..7] at the +// beginning. 
The aim is to use this in Montgomery where we discover z[j] +// entries as we go along. +// ----------------------------------------------------------------------------- + +.macro mulpadda arg1,arg2 + mulxq 8*\arg1(m), %rax, %rbx +.if ((\arg1 + \arg2) % 8 == 0) + adcxq %rax, %r8 + adoxq %rbx, %r9 +.elseif ((\arg1 + \arg2) % 8 == 1) + adcxq %rax, %r9 + adoxq %rbx, %r10 +.elseif ((\arg1 + \arg2) % 8 == 2) + adcxq %rax, %r10 + adoxq %rbx, %r11 +.elseif ((\arg1 + \arg2) % 8 == 3) + adcxq %rax, %r11 + adoxq %rbx, %r12 +.elseif ((\arg1 + \arg2) % 8 == 4) + adcxq %rax, %r12 + adoxq %rbx, %r13 +.elseif ((\arg1 + \arg2) % 8 == 5) + adcxq %rax, %r13 + adoxq %rbx, %r14 +.elseif ((\arg1 + \arg2) % 8 == 6) + adcxq %rax, %r14 + adoxq %rbx, %r15 +.elseif ((\arg1 + \arg2) % 8 == 7) + adcxq %rax, %r15 + adoxq %rbx, %r8 +.endif + +.endm + +.macro adurowa arg1 + movq w, %rdx // Get the word-level modular inverse + xorl %eax, %eax // Get a known flag state +.if (\arg1 % 8 == 0) + mulxq %r8, %rdx, %rax +.elseif (\arg1 % 8 == 1) + mulxq %r9, %rdx, %rax +.elseif (\arg1 % 8 == 2) + mulxq %r10, %rdx, %rax +.elseif (\arg1 % 8 == 3) + mulxq %r11, %rdx, %rax +.elseif (\arg1 % 8 == 4) + mulxq %r12, %rdx, %rax +.elseif (\arg1 % 8 == 5) + mulxq %r13, %rdx, %rax +.elseif (\arg1 % 8 == 6) + mulxq %r14, %rdx, %rax +.elseif (\arg1 % 8 == 7) + mulxq %r15, %rdx, %rax +.endif + + movq %rdx, 8*\arg1(z) // Store multiplier word + + mulpadda 0, \arg1 + + // Note that the bottom reg of the window is zero by construction + // So it's safe just to use "mulpadda 7" here + + mulpadda 1, \arg1 + mulpadda 2, \arg1 + mulpadda 3, \arg1 + mulpadda 4, \arg1 + mulpadda 5, \arg1 + mulpadda 6, \arg1 + mulpadda 7, \arg1 // window lowest = 0 beforehand by construction + +.if (\arg1 % 8 == 0) + adcq $0, %r8 +.elseif (\arg1 % 8 == 1) + adcq $0, %r9 +.elseif (\arg1 % 8 == 2) + adcq $0, %r10 +.elseif (\arg1 % 8 == 3) + adcq $0, %r11 +.elseif (\arg1 % 8 == 4) + adcq $0, %r12 +.elseif (\arg1 % 8 == 5) + adcq $0, %r13 +.elseif (\arg1 % 8 == 6) + adcq $0, %r14 +.elseif (\arg1 % 8 == 7) + adcq $0, %r15 +.endif + +.endm + +.macro adurowza + movq w, %rdx // Get the word-level modular inverse + xorl %eax, %eax // Get a known flag state + + movq (z), %r8 // %r8 = zeroth word + mulxq %r8, %rdx, %rax // Compute multiplier word + movq %rdx, (z) // Store multiplier word + movq 8(z), %r9 + + mulpadda 0, 0 + movq 16(z), %r10 + mulpadda 1, 0 + movq 24(z), %r11 + mulpadda 2, 0 + movq 32(z), %r12 + mulpadda 3, 0 + movq 40(z), %r13 + mulpadda 4, 0 + movq 48(z), %r14 + mulpadda 5, 0 + movq 56(z), %r15 + mulpadda 6, 0 + mulpadda 7, 0 // r8 = 0 beforehand by construction + adcq $0, %r8 +.endm + +// ----------------------------------------------------------------------------- +// Hybrid top, doing an 8 block specially then multiple additional 8 blocks +// ----------------------------------------------------------------------------- + +// Multiply-add: z := z + x[i...i+7] * m + +.macro addrows + + adurowza + adurowa 1 + adurowa 2 + adurowa 3 + adurowa 4 + adurowa 5 + adurowa 6 + adurowa 7 + + movq z, zz + + movq k8m1, %rax + testq %rax, %rax + jz bignum_emontredc_8n_innerend + movq %rax, innercount +bignum_emontredc_8n_innerloop: + addq $64, zz + addq $64, m + addrow 0 + addrow 1 + addrow 2 + addrow 3 + addrow 4 + addrow 5 + addrow 6 + addrow 7 + subq $64, innercount + jnz bignum_emontredc_8n_innerloop + + movq k8m1, %rax +bignum_emontredc_8n_innerend: + subq %rax, m + + movq carry, %rbx + negq %rbx + adcq %r8, 64(z,%rax,1) + adcq %r9, 72(z,%rax,1) + adcq %r10, 
80(z,%rax,1) + adcq %r11, 88(z,%rax,1) + adcq %r12, 96(z,%rax,1) + adcq %r13, 104(z,%rax,1) + adcq %r14, 112(z,%rax,1) + adcq %r15, 120(z,%rax,1) + movl $0, %eax + adcq $0, %rax + movq %rax, carry +.endm + +// ----------------------------------------------------------------------------- +// Main code. +// ----------------------------------------------------------------------------- + +S2N_BN_SYMBOL(bignum_emontredc_8n): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Save more registers to play with + + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Pre-initialize the return value to 0 just in case of early exit below + + xorl %eax, %eax + +// Divide the input k by 8, and push k8m1 = (k/8 - 1)<<6 which is used as +// the scaled inner loop counter / pointer adjustment repeatedly. Also push +// k/8 itself which is here initializing the outer loop count. + + shrq $3, %rdi + jz bignum_emontredc_8n_end + + leaq -1(%rdi), %rbx + shlq $6, %rbx + pushq %rbx + pushq %rdi + +// Make space for two more variables, and set between-stages carry to 0 + + subq $16, %rsp + movq $0, carry + +// Copy m into its main home + + movq %rdx, m + +// Now just systematically add in the rows + +bignum_emontredc_8n_outerloop: + addrows + addq $64, z + subq $1, outercount + jnz bignum_emontredc_8n_outerloop + +// Pop the carry-out "p", which was stored at [%rsp], put in %rax for return + + popq %rax + +// Adjust the stack + + addq $24, %rsp + +// Reset of epilog + +bignum_emontredc_8n_end: + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_kmul_16_32.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_kmul_16_32.S new file mode 100644 index 00000000000..b0508f70168 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_kmul_16_32.S @@ -0,0 +1,508 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[16], y[16]; output z[32]; temporary buffer t[>=32] +// +// extern void bignum_kmul_16_32 +// (uint64_t z[static 32], uint64_t x[static 16], uint64_t y[static 16], +// uint64_t t[static 32]) +// +// In this x86 code the final temporary space argument t is unused, but +// it is retained in the prototype above for API consistency with ARM. 
+// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y, RCX = t +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y, R9 = t +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_kmul_16_32) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_kmul_16_32) + .text + +// These parameters are kept where they come in + +#define z %rdi +#define x %rsi + +// This one gets moved to free up %rdx for muls + +#define y %rcx + +// Often used for zero + +#define zero %rbp +#define zeroe %ebp + +// mulpadd i, j adds x[i] * rdx (now assumed = y[j]) into the window at i+j + +.macro mulpadd arg1,arg2 + mulxq 8*\arg1(x), %rax, %rbx +.if ((\arg1 + \arg2) % 8 == 0) + adcxq %rax, %r8 + adoxq %rbx, %r9 +.elseif ((\arg1 + \arg2) % 8 == 1) + adcxq %rax, %r9 + adoxq %rbx, %r10 +.elseif ((\arg1 + \arg2) % 8 == 2) + adcxq %rax, %r10 + adoxq %rbx, %r11 +.elseif ((\arg1 + \arg2) % 8 == 3) + adcxq %rax, %r11 + adoxq %rbx, %r12 +.elseif ((\arg1 + \arg2) % 8 == 4) + adcxq %rax, %r12 + adoxq %rbx, %r13 +.elseif ((\arg1 + \arg2) % 8 == 5) + adcxq %rax, %r13 + adoxq %rbx, %r14 +.elseif ((\arg1 + \arg2) % 8 == 6) + adcxq %rax, %r14 + adoxq %rbx, %r15 +.elseif ((\arg1 + \arg2) % 8 == 7) + adcxq %rax, %r15 + adoxq %rbx, %r8 +.endif + +.endm + +// mulpade i, j adds x[i] * rdx (now assumed = y[j]) into the window at i+j +// but re-creates the top word assuming nothing to add there + +.macro mulpade arg1,arg2 +.if ((\arg1 + \arg2) % 8 == 0) + mulxq 8*\arg1(x), %rax, %r9 + adcxq %rax, %r8 + adoxq zero, %r9 +.elseif ((\arg1 + \arg2) % 8 == 1) + mulxq 8*\arg1(x), %rax, %r10 + adcxq %rax, %r9 + adoxq zero, %r10 +.elseif ((\arg1 + \arg2) % 8 == 2) + mulxq 8*\arg1(x), %rax, %r11 + adcxq %rax, %r10 + adoxq zero, %r11 +.elseif ((\arg1 + \arg2) % 8 == 3) + mulxq 8*\arg1(x), %rax, %r12 + adcxq %rax, %r11 + adoxq zero, %r12 +.elseif ((\arg1 + \arg2) % 8 == 4) + mulxq 8*\arg1(x), %rax, %r13 + adcxq %rax, %r12 + adoxq zero, %r13 +.elseif ((\arg1 + \arg2) % 8 == 5) + mulxq 8*\arg1(x), %rax, %r14 + adcxq %rax, %r13 + adoxq zero, %r14 +.elseif ((\arg1 + \arg2) % 8 == 6) + mulxq 8*\arg1(x), %rax, %r15 + adcxq %rax, %r14 + adoxq zero, %r15 +.elseif ((\arg1 + \arg2) % 8 == 7) + mulxq 8*\arg1(x), %rax, %r8 + adcxq %rax, %r15 + adoxq zero, %r8 +.endif + +.endm + +// addrow i adds z[i] + x[0..7] * y[i] into the window + +.macro addrow arg1 + movq 8*\arg1(y), %rdx + xorl zeroe, zeroe + +.if (\arg1 % 8 == 0) + adoxq 8*\arg1(z), %r8 +.elseif (\arg1 % 8 == 1) + adoxq 8*\arg1(z), %r9 +.elseif (\arg1 % 8 == 2) + adoxq 8*\arg1(z), %r10 +.elseif (\arg1 % 8 == 3) + adoxq 8*\arg1(z), %r11 +.elseif (\arg1 % 8 == 4) + adoxq 8*\arg1(z), %r12 +.elseif (\arg1 % 8 == 5) + adoxq 8*\arg1(z), %r13 +.elseif (\arg1 % 8 == 6) + adoxq 8*\arg1(z), %r14 +.elseif (\arg1 % 8 == 7) + adoxq 8*\arg1(z), %r15 +.endif + + mulpadd 0, \arg1 + +.if (\arg1 % 8 == 0) + movq %r8, 8*\arg1(z) +.elseif (\arg1 % 8 == 1) + movq %r9, 8*\arg1(z) +.elseif (\arg1 % 8 == 2) + movq %r10, 8*\arg1(z) +.elseif (\arg1 % 8 == 3) + movq %r11, 8*\arg1(z) +.elseif (\arg1 % 8 == 4) + movq %r12, 8*\arg1(z) +.elseif (\arg1 % 8 == 5) + movq %r13, 8*\arg1(z) +.elseif (\arg1 % 8 == 6) + movq %r14, 8*\arg1(z) +.elseif (\arg1 % 8 == 7) + movq %r15, 8*\arg1(z) +.endif + + mulpadd 1, \arg1 + mulpadd 2, \arg1 + mulpadd 3, \arg1 + mulpadd 4, \arg1 + mulpadd 5, \arg1 + mulpadd 6, \arg1 + mulpade 7, \arg1 + +.if (\arg1 % 8 == 0) + adcq zero, %r8 +.elseif (\arg1 % 8 == 1) + adcq zero, %r9 +.elseif (\arg1 % 8 == 2) + adcq zero, %r10 
+.elseif (\arg1 % 8 == 3) + adcq zero, %r11 +.elseif (\arg1 % 8 == 4) + adcq zero, %r12 +.elseif (\arg1 % 8 == 5) + adcq zero, %r13 +.elseif (\arg1 % 8 == 6) + adcq zero, %r14 +.elseif (\arg1 % 8 == 7) + adcq zero, %r15 +.endif + +.endm + +// Special zero version of addrow, setting up the window from scratch + +.macro addrowz + movq (y), %rdx + xorl zeroe, zeroe + + mulxq (x), %rax, %r9 + adcq %rax, (z) + + mulxq 8(x), %rax, %r10 + adcq %rax, %r9 + + mulxq 16(x), %rax, %r11 + adcq %rax, %r10 + + mulxq 24(x), %rax, %r12 + adcq %rax, %r11 + + mulxq 32(x), %rax, %r13 + adcq %rax, %r12 + + mulxq 40(x), %rax, %r14 + adcq %rax, %r13 + + mulxq 48(x), %rax, %r15 + adcq %rax, %r14 + + mulxq 56(x), %rax, %r8 + adcq %rax, %r15 + + adcq zero, %r8 +.endm + +// This is a variant where we add the initial z[0..7] at the outset. +// This makes the initialization process a bit less wasteful. By doing +// a block of 8 we get the same effect except that we add z[0..7] +// +// adurow i adds 2^{7*64} * z[i+7] + x[0..7] * y[i] into the window + +.macro adurow arg1 + movq 8*\arg1(y), %rdx + xorl zeroe, zeroe + + mulpadd 0, \arg1 + +.if (\arg1 % 8 == 0) + movq %r8, 8*\arg1(z) +.elseif (\arg1 % 8 == 1) + movq %r9, 8*\arg1(z) +.elseif (\arg1 % 8 == 2) + movq %r10, 8*\arg1(z) +.elseif (\arg1 % 8 == 3) + movq %r11, 8*\arg1(z) +.elseif (\arg1 % 8 == 4) + movq %r12, 8*\arg1(z) +.elseif (\arg1 % 8 == 5) + movq %r13, 8*\arg1(z) +.elseif (\arg1 % 8 == 6) + movq %r14, 8*\arg1(z) +.elseif (\arg1 % 8 == 7) + movq %r15, 8*\arg1(z) +.endif + + mulpadd 1, \arg1 + mulpadd 2, \arg1 + mulpadd 3, \arg1 + mulpadd 4, \arg1 + mulpadd 5, \arg1 + mulpadd 6, \arg1 + mulpade 7, \arg1 + +.if (\arg1 % 8 == 0) + adcq zero, %r8 +.elseif (\arg1 % 8 == 1) + adcq zero, %r9 +.elseif (\arg1 % 8 == 2) + adcq zero, %r10 +.elseif (\arg1 % 8 == 3) + adcq zero, %r11 +.elseif (\arg1 % 8 == 4) + adcq zero, %r12 +.elseif (\arg1 % 8 == 5) + adcq zero, %r13 +.elseif (\arg1 % 8 == 6) + adcq zero, %r14 +.elseif (\arg1 % 8 == 7) + adcq zero, %r15 +.endif + +.endm + +// Special "adurow 0" case to do first stage + +.macro adurowz + movq (y), %rdx + xorl zeroe, zeroe + + movq (z), %r8 + movq 8(z), %r9 + + mulpadd 0, 0 + movq %r8, (z) + + movq 16(z), %r10 + mulpadd 1, 0 + movq 24(z), %r11 + mulpadd 2, 0 + movq 32(z), %r12 + mulpadd 3, 0 + movq 40(z), %r13 + mulpadd 4, 0 + movq 48(z), %r14 + mulpadd 5, 0 + movq 56(z), %r15 + mulpadd 6, 0 + + mulxq 56(x), %rax, %r8 + adcxq %rax, %r15 + adoxq zero, %r8 + adcxq zero, %r8 +.endm + +// Multiply-add: z := z + x[0..7] * y + +.macro addrows + adurowz + adurow 1 + adurow 2 + adurow 3 + adurow 4 + adurow 5 + adurow 6 + adurow 7 + addrow 8 + addrow 9 + addrow 10 + addrow 11 + addrow 12 + addrow 13 + addrow 14 + addrow 15 + + movq %r8, 128(z) + movq %r9, 136(z) + movq %r10, 144(z) + movq %r11, 152(z) + movq %r12, 160(z) + movq %r13, 168(z) + movq %r14, 176(z) + movq %r15, 184(z) + +.endm + +// mulrow i adds x[0..7] * y[i] into the window +// just like addrow but no addition of z[i] + +.macro mulrow arg1 + movq 8*\arg1(y), %rdx + xorl zeroe, zeroe + + mulpadd 0, \arg1 + +.if (\arg1 % 8 == 0) + movq %r8, 8*\arg1(z) +.elseif (\arg1 % 8 == 1) + movq %r9, 8*\arg1(z) +.elseif (\arg1 % 8 == 2) + movq %r10, 8*\arg1(z) +.elseif (\arg1 % 8 == 3) + movq %r11, 8*\arg1(z) +.elseif (\arg1 % 8 == 4) + movq %r12, 8*\arg1(z) +.elseif (\arg1 % 8 == 5) + movq %r13, 8*\arg1(z) +.elseif (\arg1 % 8 == 6) + movq %r14, 8*\arg1(z) +.elseif (\arg1 % 8 == 7) + movq %r15, 8*\arg1(z) +.endif + + mulpadd 1, \arg1 + mulpadd 2, \arg1 + mulpadd 3, \arg1 + 
mulpadd 4, \arg1 + mulpadd 5, \arg1 + mulpadd 6, \arg1 + mulpade 7, \arg1 + +.if (\arg1 % 8 == 0) + adcq zero, %r8 +.elseif (\arg1 % 8 == 1) + adcq zero, %r9 +.elseif (\arg1 % 8 == 2) + adcq zero, %r10 +.elseif (\arg1 % 8 == 3) + adcq zero, %r11 +.elseif (\arg1 % 8 == 4) + adcq zero, %r12 +.elseif (\arg1 % 8 == 5) + adcq zero, %r13 +.elseif (\arg1 % 8 == 6) + adcq zero, %r14 +.elseif (\arg1 % 8 == 7) + adcq zero, %r15 +.endif + + +.endm + +// Special zero version of mulrow, setting up the window from scratch + +.macro mulrowz + movq (y), %rdx + xorl zeroe, zeroe + + mulxq (x), %rax, %r9 + movq %rax, (z) + + mulxq 8(x), %rax, %r10 + adcxq %rax, %r9 + + mulxq 16(x), %rax, %r11 + adcxq %rax, %r10 + + mulxq 24(x), %rax, %r12 + adcxq %rax, %r11 + + mulxq 32(x), %rax, %r13 + adcxq %rax, %r12 + + mulxq 40(x), %rax, %r14 + adcxq %rax, %r13 + + mulxq 48(x), %rax, %r15 + adcxq %rax, %r14 + + mulxq 56(x), %rax, %r8 + adcxq %rax, %r15 + + adcq zero, %r8 +.endm + +// Multiply-add: z := x[0..7] * y plus window + +.macro mulrows + mulrowz + mulrow 1 + mulrow 2 + mulrow 3 + mulrow 4 + mulrow 5 + mulrow 6 + mulrow 7 + + mulrow 8 + mulrow 9 + mulrow 10 + mulrow 11 + mulrow 12 + mulrow 13 + mulrow 14 + mulrow 15 + + movq %r8, 128(z) + movq %r9, 136(z) + movq %r10, 144(z) + movq %r11, 152(z) + movq %r12, 160(z) + movq %r13, 168(z) + movq %r14, 176(z) + movq %r15, 184(z) + +.endm + + +S2N_BN_SYMBOL(bignum_kmul_16_32): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Move y into its permanent home, freeing up %rdx for its special role in muls + + movq %rdx, y + +// Do the zeroth row as a pure product then the next as multiply-add + + mulrows + + addq $64, z + addq $64, x + addrows + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_kmul_32_64.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_kmul_32_64.S new file mode 100644 index 00000000000..640e9ab4733 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_kmul_32_64.S @@ -0,0 +1,1149 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[32], y[32]; output z[64]; temporary buffer t[>=96] +// +// extern void bignum_kmul_32_64 +// (uint64_t z[static 64], uint64_t x[static 32], uint64_t y[static 32], +// uint64_t t[static 96]) +// +// This is a Karatsuba-style function multiplying half-sized results +// internally and using temporary buffer t for intermediate results. The size +// of 96 is an overstatement for compatibility with the ARM version; it +// actually only uses 65 elements of t (64 + 1 for a stashed sign). 
+// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y, RCX = t +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y, R9 = t +// ----------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_kmul_32_64) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_kmul_32_64) + .text + +#define K 16 + +#define z %rdi +#define x %rsi +#define y %rcx + +#define s %r9 + +// We re-use the y variable to point at t later on, when this seems clearer + +#define t %rcx + +S2N_BN_SYMBOL(bignum_kmul_32_64): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Save callee-saved registers and also push t onto the stack; we'll +// use this space to back up both t and later z. Then move the y variable +// into its longer-term home for the first few stages. + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + pushq %rcx + movq %rdx, y + +// Multiply the low halves + + callq bignum_kmul_32_64_local_bignum_kmul_16_32 + +// Multiply the high halves + + leaq 16*K-0x40(%rdi), %rdi + leaq 8*K-0x40(%rsi), %rsi + leaq 8*K(%rcx), %rcx + callq bignum_kmul_32_64_local_bignum_kmul_16_32 + +// Establish %r8 as the t pointer and use the cell to back up z now + + movq (%rsp), %r8 + subq $16*K+0x40, %rdi + movq %rdi, (%rsp) + +// Form |x_lo - x_hi| starting at t + + movq -8*K-0x40(%rsi), %rax + subq -8*K-0x40+8*K(%rsi), %rax + movq %rax, (%r8) + .set I, 1 + .rep K-1 + movq -8*K-0x40+8*I(%rsi), %rax + sbbq -8*K-0x40+8*K+8*I(%rsi), %rax + movq %rax, 8*I(%r8) + .set I, (I+1) + .endr + + movl $0, %ebx + sbbq s, s // Maintain CF, set ZF for cmovs, record sign + + .set I, 0 + .rep K + movq 8*I(%r8), %rdx + movq %rdx, %rax + notq %rdx + cmovzq %rax, %rdx + adcxq %rbx, %rdx + movq %rdx, 8*I(%r8) + .set I, (I+1) + .endr + +// Form |y_hi - y_lo| (note opposite order) starting at t[K] + + movq -8*K+8*K(%rcx), %rax + subq -8*K(%rcx), %rax + movq %rax, 8*K(%r8) + .set I, 1 + .rep K-1 + movq -8*K+8*K+8*I(%rcx), %rax + sbbq -8*K+8*I(%rcx), %rax + movq %rax, 8*K+8*I(%r8) + .set I, (I+1) + .endr + + movl $0, %ebx + sbbq %rbp, %rbp // Maintain CF, set ZF for cmovs + + .set I, 0 + .rep K + movq 8*K+8*I(%r8), %rdx + movq %rdx, %rax + notq %rdx + cmovzq %rax, %rdx + adcxq %rbx, %rdx + movq %rdx, 8*K+8*I(%r8) + .set I, (I+1) + .endr + +// Stash the final sign with which to add things at t[4*K] + + xorq %rbp, s + movq s, 32*K(%r8) + +// Multiply the absolute differences, putting the result at t[2*K] +// This has the side-effect of putting t in the "right" register %rcx +// so after the load of z, we have both z and t pointers straight. + + movq %r8, %rcx + leaq 8*K(%r8), %rsi + leaq 16*K(%r8), %rdi + callq bignum_kmul_32_64_local_bignum_kmul_16_32 + movq (%rsp), z + +// Compose the middle parts [2,1] + [1,0] + [3,2], saving carry in %rbx. +// Put the sum at t, overwriting the absolute differences we no longer need. + + xorl %ebx, %ebx + .set I, 0 + .rep 2*K + movq 8*K+8*I(z), %rax + adcxq 8*I(z), %rax + adoxq 16*K+8*I(z), %rax + movq %rax, 8*I(t) + .set I, (I+1) + .endr + adoxq %rbx, %rbx + adcq $0, %rbx + +// Sign-aware addition or subtraction of the complicated term. +// We double-negate it to set CF/ZF while not spoiling its +// actual form: note that we eventually adcx to it below. 
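The conditional add-or-subtract that the comment above describes is the standard constant-time negation trick: when the stashed sign is non-zero, every limb of the cross product is complemented and a single +1 is fed in through the carry chain, i.e. a two's-complement negation spread across the loop. A stand-alone C sketch of that trick, with an invented helper name and without the fusing into the running addition that the assembly performs via cmovz/not plus adcx:

#include <stdint.h>
#include <stddef.h>

// Conditionally negate an n-limb little-endian value in constant time.
// sign must be 0 (leave unchanged) or ~0ULL (negate).
static void cond_negate(uint64_t *a, size_t n, uint64_t sign) {
  unsigned __int128 carry = sign & 1;      // the "+1" of two's complement
  for (size_t i = 0; i < n; i++) {
    carry += (uint64_t)(a[i] ^ sign);      // XOR with all-ones == bitwise NOT
    a[i] = (uint64_t)carry;
    carry >>= 64;
  }
}

In the assembly the conditionally complemented limbs are never written back separately; they are added straight into the middle words of z through the adcx chain, and the stashed sign is finally folded into the top-level carry held in %rbx.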
+ + movq 32*K(t), s + negq s + negq s + + .set I, 0 + .rep 2*K + movq 16*K+8*I(t), %rdx + movq %rdx, %rax + notq %rdx + cmovzq %rax, %rdx + adcxq 8*I(t), %rdx + movq %rdx, 8*K+8*I(z) + .set I, (I+1) + .endr + +// Bump the accumulated carry. This must end up >= 0 because it's the top +// word of a value of the form ... + h * h' + l * l' - (h - l) * (h' - l') >= 0 + + adcxq s, %rbx + +// Finally propagate the carry to the top part + + xorl %eax, %eax + addq %rbx, 24*K(z) + .set I, 1 + .rep K-1 + adcq %rax, 24*K+8*I(z) + .set I, (I+1) + .endr + +// Restore and return. The first pop is not needed for the ABI but +// we need to adjust the stack anyway so it seems reasonable. + + popq %rcx + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +// Local copy of half-length subroutine. This has a slightly different +// interface, expecting y argument in %rcx directly, and not doing any +// save-restore of the other registers. It naturally moves z and x on by +// 0x40, which we compensate for when it is called by adjusting offsets. + +bignum_kmul_32_64_local_bignum_kmul_16_32: + movq (%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %r9 + movq %rax, (%rdi) + mulxq 0x8(%rsi), %rax, %r10 + adcxq %rax, %r9 + mulxq 0x10(%rsi), %rax, %r11 + adcxq %rax, %r10 + mulxq 0x18(%rsi), %rax, %r12 + adcxq %rax, %r11 + mulxq 0x20(%rsi), %rax, %r13 + adcxq %rax, %r12 + mulxq 0x28(%rsi), %rax, %r14 + adcxq %rax, %r13 + mulxq 0x30(%rsi), %rax, %r15 + adcxq %rax, %r14 + mulxq 0x38(%rsi), %rax, %r8 + adcxq %rax, %r15 + adcq %rbp, %r8 + movq 0x8(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + movq %r9, 0x8(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x38(%rsi), %rax, %r9 + adcxq %rax, %r8 + adoxq %rbp, %r9 + adcq %rbp, %r9 + movq 0x10(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq %r10, 0x10(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x38(%rsi), %rax, %r10 + adcxq %rax, %r9 + adoxq %rbp, %r10 + adcq %rbp, %r10 + movq 0x18(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq %r11, 0x18(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x38(%rsi), %rax, %r11 + adcxq %rax, %r10 + adoxq %rbp, %r11 + adcq %rbp, %r11 + movq 0x20(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx 
+ adcxq %rax, %r12 + adoxq %rbx, %r13 + movq %r12, 0x20(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x38(%rsi), %rax, %r12 + adcxq %rax, %r11 + adoxq %rbp, %r12 + adcq %rbp, %r12 + movq 0x28(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + movq %r13, 0x28(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x38(%rsi), %rax, %r13 + adcxq %rax, %r12 + adoxq %rbp, %r13 + adcq %rbp, %r13 + movq 0x30(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + movq %r14, 0x30(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x38(%rsi), %rax, %r14 + adcxq %rax, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + movq 0x38(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + movq %r15, 0x38(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x38(%rsi), %rax, %r15 + adcxq %rax, %r14 + adoxq %rbp, %r15 + adcq %rbp, %r15 + movq 0x40(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + movq %r8, 0x40(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x38(%rsi), %rax, %r8 + adcxq %rax, %r15 + adoxq %rbp, %r8 + adcq %rbp, %r8 + movq 0x48(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + movq %r9, 0x48(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 
0x30(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x38(%rsi), %rax, %r9 + adcxq %rax, %r8 + adoxq %rbp, %r9 + adcq %rbp, %r9 + movq 0x50(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq %r10, 0x50(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x38(%rsi), %rax, %r10 + adcxq %rax, %r9 + adoxq %rbp, %r10 + adcq %rbp, %r10 + movq 0x58(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq %r11, 0x58(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x38(%rsi), %rax, %r11 + adcxq %rax, %r10 + adoxq %rbp, %r11 + adcq %rbp, %r11 + movq 0x60(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq %r12, 0x60(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x38(%rsi), %rax, %r12 + adcxq %rax, %r11 + adoxq %rbp, %r12 + adcq %rbp, %r12 + movq 0x68(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + movq %r13, 0x68(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x38(%rsi), %rax, %r13 + adcxq %rax, %r12 + adoxq %rbp, %r13 + adcq %rbp, %r13 + movq 0x70(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + movq %r14, 0x70(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x38(%rsi), %rax, %r14 + adcxq %rax, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + movq 0x78(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + movq %r15, 0x78(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 
0x18(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x38(%rsi), %rax, %r15 + adcxq %rax, %r14 + adoxq %rbp, %r15 + adcq %rbp, %r15 + movq %r8, 0x80(%rdi) + movq %r9, 0x88(%rdi) + movq %r10, 0x90(%rdi) + movq %r11, 0x98(%rdi) + movq %r12, 0xa0(%rdi) + movq %r13, 0xa8(%rdi) + movq %r14, 0xb0(%rdi) + movq %r15, 0xb8(%rdi) + addq $0x40, %rdi + addq $0x40, %rsi + movq (%rcx), %rdx + xorl %ebp, %ebp + movq (%rdi), %r8 + movq 0x8(%rdi), %r9 + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + movq %r8, (%rdi) + movq 0x10(%rdi), %r10 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + movq 0x18(%rdi), %r11 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq 0x20(%rdi), %r12 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x28(%rdi), %r13 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq 0x30(%rdi), %r14 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + movq 0x38(%rdi), %r15 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x38(%rsi), %rax, %r8 + adcxq %rax, %r15 + adoxq %rbp, %r8 + adcxq %rbp, %r8 + movq 0x8(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + movq %r9, 0x8(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x38(%rsi), %rax, %r9 + adcxq %rax, %r8 + adoxq %rbp, %r9 + adcq %rbp, %r9 + movq 0x10(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq %r10, 0x10(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x38(%rsi), %rax, %r10 + adcxq %rax, %r9 + adoxq %rbp, %r10 + adcq %rbp, %r10 + movq 0x18(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq %r11, 0x18(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x38(%rsi), %rax, %r11 + adcxq %rax, %r10 + adoxq %rbp, %r11 + adcq %rbp, %r11 + movq 0x20(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq %r12, 0x20(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x18(%rsi), %rax, 
%rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x38(%rsi), %rax, %r12 + adcxq %rax, %r11 + adoxq %rbp, %r12 + adcq %rbp, %r12 + movq 0x28(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + movq %r13, 0x28(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x38(%rsi), %rax, %r13 + adcxq %rax, %r12 + adoxq %rbp, %r13 + adcq %rbp, %r13 + movq 0x30(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + movq %r14, 0x30(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x38(%rsi), %rax, %r14 + adcxq %rax, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + movq 0x38(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + movq %r15, 0x38(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x38(%rsi), %rax, %r15 + adcxq %rax, %r14 + adoxq %rbp, %r15 + adcq %rbp, %r15 + movq 0x40(%rcx), %rdx + xorl %ebp, %ebp + adoxq 0x40(%rdi), %r8 + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + movq %r8, 0x40(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x38(%rsi), %rax, %r8 + adcxq %rax, %r15 + adoxq %rbp, %r8 + adcq %rbp, %r8 + movq 0x48(%rcx), %rdx + xorl %ebp, %ebp + adoxq 0x48(%rdi), %r9 + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + movq %r9, 0x48(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x38(%rsi), %rax, %r9 + adcxq %rax, %r8 + adoxq %rbp, %r9 + adcq %rbp, %r9 + movq 0x50(%rcx), %rdx + xorl %ebp, %ebp + 
adoxq 0x50(%rdi), %r10 + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq %r10, 0x50(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x38(%rsi), %rax, %r10 + adcxq %rax, %r9 + adoxq %rbp, %r10 + adcq %rbp, %r10 + movq 0x58(%rcx), %rdx + xorl %ebp, %ebp + adoxq 0x58(%rdi), %r11 + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq %r11, 0x58(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x38(%rsi), %rax, %r11 + adcxq %rax, %r10 + adoxq %rbp, %r11 + adcq %rbp, %r11 + movq 0x60(%rcx), %rdx + xorl %ebp, %ebp + adoxq 0x60(%rdi), %r12 + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq %r12, 0x60(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x38(%rsi), %rax, %r12 + adcxq %rax, %r11 + adoxq %rbp, %r12 + adcq %rbp, %r12 + movq 0x68(%rcx), %rdx + xorl %ebp, %ebp + adoxq 0x68(%rdi), %r13 + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + movq %r13, 0x68(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x38(%rsi), %rax, %r13 + adcxq %rax, %r12 + adoxq %rbp, %r13 + adcq %rbp, %r13 + movq 0x70(%rcx), %rdx + xorl %ebp, %ebp + adoxq 0x70(%rdi), %r14 + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + movq %r14, 0x70(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x38(%rsi), %rax, %r14 + adcxq %rax, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + movq 0x78(%rcx), %rdx + xorl %ebp, %ebp + adoxq 0x78(%rdi), %r15 + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + movq %r15, 0x78(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r10 
+ adoxq %rbx, %r11 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x38(%rsi), %rax, %r15 + adcxq %rax, %r14 + adoxq %rbp, %r15 + adcq %rbp, %r15 + movq %r8, 0x80(%rdi) + movq %r9, 0x88(%rdi) + movq %r10, 0x90(%rdi) + movq %r11, 0x98(%rdi) + movq %r12, 0xa0(%rdi) + movq %r13, 0xa8(%rdi) + movq %r14, 0xb0(%rdi) + movq %r15, 0xb8(%rdi) + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_ksqr_16_32.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_ksqr_16_32.S new file mode 100644 index 00000000000..86e853d2cb6 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_ksqr_16_32.S @@ -0,0 +1,540 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[16]; output z[32]; temporary buffer t[>=24] +// +// extern void bignum_ksqr_16_32 +// (uint64_t z[static 32], uint64_t x[static 16], uint64_t t[static 24]); +// +// In this x86 code the final temporary space argument t is unused, but +// it is retained in the prototype above for API consistency with ARM. +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = t +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = t +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_ksqr_16_32) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_ksqr_16_32) + .text + +#define z %rdi +#define x %rsi + +// A zero register + +#define zero %rbp +#define zeroe %ebp + +// ------------------------------------------------------------------------ +// mulpadd i, j adds rdx * x[i] into the window at the i+j point +// ------------------------------------------------------------------------ + +.macro mulpadd arg1,arg2 + mulxq 8*\arg1(x), %rax, %rcx +.if ((\arg1 + \arg2) % 8 == 0) + adcxq %rax, %r8 + adoxq %rcx, %r9 +.elseif ((\arg1 + \arg2) % 8 == 1) + adcxq %rax, %r9 + adoxq %rcx, %r10 +.elseif ((\arg1 + \arg2) % 8 == 2) + adcxq %rax, %r10 + adoxq %rcx, %r11 +.elseif ((\arg1 + \arg2) % 8 == 3) + adcxq %rax, %r11 + adoxq %rcx, %r12 +.elseif ((\arg1 + \arg2) % 8 == 4) + adcxq %rax, %r12 + adoxq %rcx, %r13 +.elseif ((\arg1 + \arg2) % 8 == 5) + adcxq %rax, %r13 + adoxq %rcx, %r14 +.elseif ((\arg1 + \arg2) % 8 == 6) + adcxq %rax, %r14 + adoxq %rcx, %r15 +.elseif ((\arg1 + \arg2) % 8 == 7) + adcxq %rax, %r15 + adoxq %rcx, %r8 +.endif + +.endm + +// ------------------------------------------------------------------------ +// mulpade i, j adds rdx * x[i] into the window at i+j +// but re-creates the top word assuming nothing to add there +// ------------------------------------------------------------------------ + +.macro mulpade arg1,arg2 +.if ((\arg1 + \arg2) % 8 == 0) + mulxq 8*\arg1(x), %rax, %r9 + adcxq %rax, %r8 + adoxq zero, %r9 +.elseif ((\arg1 + \arg2) % 8 == 1) + mulxq 8*\arg1(x), %rax, %r10 + adcxq %rax, %r9 + adoxq zero, %r10 +.elseif ((\arg1 + \arg2) % 8 == 2) + mulxq 8*\arg1(x), %rax, %r11 + adcxq %rax, %r10 + adoxq zero, %r11 +.elseif ((\arg1 + \arg2) % 8 == 3) + mulxq 8*\arg1(x), %rax, %r12 + adcxq %rax, %r11 + adoxq zero, %r12 +.elseif ((\arg1 + \arg2) % 8 == 
4) + mulxq 8*\arg1(x), %rax, %r13 + adcxq %rax, %r12 + adoxq zero, %r13 +.elseif ((\arg1 + \arg2) % 8 == 5) + mulxq 8*\arg1(x), %rax, %r14 + adcxq %rax, %r13 + adoxq zero, %r14 +.elseif ((\arg1 + \arg2) % 8 == 6) + mulxq 8*\arg1(x), %rax, %r15 + adcxq %rax, %r14 + adoxq zero, %r15 +.elseif ((\arg1 + \arg2) % 8 == 7) + mulxq 8*\arg1(x), %rax, %r8 + adcxq %rax, %r15 + adoxq zero, %r8 +.endif + +.endm + +// ------------------------------------------------------------------------ +// addrow i,j adds z[i+j] + x[i..i+7] * x[j] into the window +// ------------------------------------------------------------------------ + +.macro addrow arg1,arg2 + movq 8*\arg2(x), %rdx + xorl zeroe, zeroe // Get a known flag state and give a zero reg + +.if ((\arg1 + \arg2) % 8 == 0) + adoxq 8*(\arg1+\arg2)(z), %r8 +.elseif ((\arg1 + \arg2) % 8 == 1) + adoxq 8*(\arg1+\arg2)(z), %r9 +.elseif ((\arg1 + \arg2) % 8 == 2) + adoxq 8*(\arg1+\arg2)(z), %r10 +.elseif ((\arg1 + \arg2) % 8 == 3) + adoxq 8*(\arg1+\arg2)(z), %r11 +.elseif ((\arg1 + \arg2) % 8 == 4) + adoxq 8*(\arg1+\arg2)(z), %r12 +.elseif ((\arg1 + \arg2) % 8 == 5) + adoxq 8*(\arg1+\arg2)(z), %r13 +.elseif ((\arg1 + \arg2) % 8 == 6) + adoxq 8*(\arg1+\arg2)(z), %r14 +.elseif ((\arg1 + \arg2) % 8 == 7) + adoxq 8*(\arg1+\arg2)(z), %r15 +.endif + + mulpadd \arg1, \arg2 + +.if ((\arg1 + \arg2) % 8 == 0) + movq %r8, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 1) + movq %r9, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 2) + movq %r10, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 3) + movq %r11, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 4) + movq %r12, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 5) + movq %r13, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 6) + movq %r14, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 7) + movq %r15, 8*(\arg1+\arg2)(z) +.endif + + mulpadd (\arg1+1), \arg2 + mulpadd (\arg1+2), \arg2 + mulpadd (\arg1+3), \arg2 + mulpadd (\arg1+4), \arg2 + mulpadd (\arg1+5), \arg2 + mulpade (\arg1+6), \arg2 + mulpade (\arg1+7), \arg2 + +.if ((\arg1 + \arg2) % 8 == 0) + adcxq zero, %r8 +.elseif ((\arg1 + \arg2) % 8 == 1) + adcxq zero, %r9 +.elseif ((\arg1 + \arg2) % 8 == 2) + adcxq zero, %r10 +.elseif ((\arg1 + \arg2) % 8 == 3) + adcxq zero, %r11 +.elseif ((\arg1 + \arg2) % 8 == 4) + adcxq zero, %r12 +.elseif ((\arg1 + \arg2) % 8 == 5) + adcxq zero, %r13 +.elseif ((\arg1 + \arg2) % 8 == 6) + adcxq zero, %r14 +.elseif ((\arg1 + \arg2) % 8 == 7) + adcxq zero, %r15 +.endif + + +.endm + + +// ------------------------------------------------------------------------ +// Adds off-diagonal part of x[i..i+7]^2 into the window, writes 0..7 back +// ------------------------------------------------------------------------ + +.macro sqr arg1 + + xorl zeroe, zeroe + +// Set up the initial window + + movq 16*\arg1+8(z), %r9 + movq 16*\arg1+16(z), %r10 + movq 16*\arg1+24(z), %r11 + movq 16*\arg1+32(z), %r12 + movq 16*\arg1+40(z), %r13 + movq 16*\arg1+48(z), %r14 + movq 16*\arg1+56(z), %r15 + +// Add in the first diagonal [%r8..%r10] + 2 wb = 10 + 20 + 30 + 40 + 50 + 60 + 70 + + movq 8*\arg1(x), %rdx + mulpadd (\arg1+1), (\arg1+0) + movq %r9, 16*\arg1+8(z) + mulpadd (\arg1+2), (\arg1+0) + movq %r10, 16*\arg1+16(z) + mulpadd (\arg1+3), (\arg1+0) + mulpadd (\arg1+4), (\arg1+0) + mulpadd (\arg1+5), (\arg1+0) + mulpadd (\arg1+6), (\arg1+0) + mulpade (\arg1+7), (\arg1+0) + adcxq zero, %r8 + +// Add in the next diagonal = 21 + 31 + 41 + 51 + 61 + 71 + 54 + + xorl zeroe, zeroe + movq 8*\arg1+8(x), %rdx + mulpadd (\arg1+2), 
(\arg1+1) + movq %r11, 16*\arg1+24(z) + mulpadd (\arg1+3), (\arg1+1) + movq %r12, 16*\arg1+32(z) + mulpadd (\arg1+4), (\arg1+1) + mulpadd (\arg1+5), (\arg1+1) + mulpadd (\arg1+6), (\arg1+1) + mulpade (\arg1+7), (\arg1+1) + movq 8*\arg1+32(x), %rdx + mulpade (\arg1+5), (\arg1+4) + adcxq zero, %r10 + +// And the next one = 32 + 42 + 52 + 62 + 72 + 64 + 65 + + xorl zeroe, zeroe + movq 8*\arg1+16(x), %rdx + mulpadd (\arg1+3), (\arg1+2) + movq %r13, 16*\arg1+40(z) + mulpadd (\arg1+4), (\arg1+2) + movq %r14, 16*\arg1+48(z) + mulpadd (\arg1+5), (\arg1+2) + mulpadd (\arg1+6), (\arg1+2) + mulpadd (\arg1+7), (\arg1+2) + movq 8*\arg1+48(x), %rdx + mulpade (\arg1+4), (\arg1+6) + mulpade (\arg1+5), (\arg1+6) + adcxq zero, %r12 + +// And the final one = 43 + 53 + 63 + 73 + 74 + 75 + 76 + + xorl zeroe, zeroe + movq 8*\arg1+24(x), %rdx + mulpadd (\arg1+4), (\arg1+3) + movq %r15, 16*\arg1+56(z) + mulpadd (\arg1+5), (\arg1+3) + mulpadd (\arg1+6), (\arg1+3) + mulpadd (\arg1+7), (\arg1+3) + movq 8*\arg1+56(x), %rdx + mulpadd (\arg1+4), (\arg1+7) + mulpade (\arg1+5), (\arg1+7) + mulpade (\arg1+6), (\arg1+7) + adcxq zero, %r14 +.endm + +// ------------------------------------------------------------------------ +// Multiply-add: z := z + x[i...i+7] * x +// ------------------------------------------------------------------------ + +.macro addrows arg1 + + sqr \arg1 + + .set I, (\arg1+8) +.rep (8-\arg1) + addrow \arg1, I + .set I, (I+1) +.endr + + movq %r8, 8*(16+\arg1)(z) + movq %r9, 8*(17+\arg1)(z) + movq %r10, 8*(18+\arg1)(z) + movq %r11, 8*(19+\arg1)(z) + movq %r12, 8*(20+\arg1)(z) + movq %r13, 8*(21+\arg1)(z) + movq %r14, 8*(22+\arg1)(z) +.endm + + +// ------------------------------------------------------------------------ +// mulrow i,j adds x[i..i+7] * x[j] into the window +// just like addrow but no addition of z[i+j] +// ------------------------------------------------------------------------ + +.macro mulrow arg1,arg2 + movq 8*\arg2(x), %rdx + xorl zeroe, zeroe // Get a known flag state and give a zero reg + + mulpadd \arg1, \arg2 + +.if ((\arg1 + \arg2) % 8 == 0) + movq %r8, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 1) + movq %r9, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 2) + movq %r10, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 3) + movq %r11, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 4) + movq %r12, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 5) + movq %r13, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 6) + movq %r14, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 7) + movq %r15, 8*(\arg1+\arg2)(z) +.endif + + mulpadd (\arg1+1), \arg2 + mulpadd (\arg1+2), \arg2 + mulpadd (\arg1+3), \arg2 + mulpadd (\arg1+4), \arg2 + mulpadd (\arg1+5), \arg2 +.if ((\arg1 + \arg2) % 8 == 0) + mulpade (\arg1+6), \arg2 +.else + mulpadd (\arg1+6), \arg2 +.endif + + mulpade (\arg1+7), \arg2 + +.if ((\arg1 + \arg2) % 8 == 0) + adcxq zero, %r8 +.elseif ((\arg1 + \arg2) % 8 == 1) + adcxq zero, %r9 +.elseif ((\arg1 + \arg2) % 8 == 2) + adcxq zero, %r10 +.elseif ((\arg1 + \arg2) % 8 == 3) + adcxq zero, %r11 +.elseif ((\arg1 + \arg2) % 8 == 4) + adcxq zero, %r12 +.elseif ((\arg1 + \arg2) % 8 == 5) + adcxq zero, %r13 +.elseif ((\arg1 + \arg2) % 8 == 6) + adcxq zero, %r14 +.elseif ((\arg1 + \arg2) % 8 == 7) + adcxq zero, %r15 +.endif + + +.endm + +// ------------------------------------------------------------------------ +// Compute off-diagonal part of x[0..7]^2, write back 1..7 elements and +// set up the high part in the standard register window. DOES NOT WRITE z[0]! 
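+// (z[0] itself is only written at the very end, from the low word of x[0]^2
+// in the final diagonal pass.)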
+// ------------------------------------------------------------------------ + +.macro sqrz + + xorl zeroe, zeroe + +// Set initial window [%r8..%r10] + 2 wb = 10 + 20 + 30 + 40 + 50 + 60 + 70 + + movq (x), %rdx + mulxq 8(x), %r9, %rax + movq %r9, 8(z) + mulxq 16(x), %r10, %rcx + adcxq %rax, %r10 + movq %r10, 16(z) + mulxq 24(x), %r11, %rax + adcxq %rcx, %r11 + mulxq 32(x), %r12, %rcx + adcxq %rax, %r12 + mulxq 40(x), %r13, %rax + adcxq %rcx, %r13 + mulxq 48(x), %r14, %rcx + adcxq %rax, %r14 + mulxq 56(x), %r15, %r8 + adcxq %rcx, %r15 + adcxq zero, %r8 + +// Add in the next diagonal = 21 + 31 + 41 + 51 + 61 + 71 + 54 + + xorl zeroe, zeroe + movq 8(x), %rdx + mulpadd 2, 1 + movq %r11, 24(z) + mulpadd 3, 1 + movq %r12, 32(z) + mulpadd 4, 1 + mulpadd 5, 1 + mulpadd 6, 1 + mulpade 7, 1 + movq 32(x), %rdx + mulpade 5, 4 + adcxq zero, %r10 + +// And the next one = 32 + 42 + 52 + 62 + 72 + 64 + 65 + + xorl zeroe, zeroe + movq 16(x), %rdx + mulpadd 3, 2 + movq %r13, 40(z) + mulpadd 4, 2 + movq %r14, 48(z) + mulpadd 5, 2 + mulpadd 6, 2 + mulpadd 7, 2 + movq 48(x), %rdx + mulpade 4, 6 + mulpade 5, 6 + adcxq zero, %r12 + +// And the final one = 43 + 53 + 63 + 73 + 74 + 75 + 76 + + xorl zeroe, zeroe + movq 24(x), %rdx + mulpadd 4, 3 + movq %r15, 56(z) + mulpadd 5, 3 + mulpadd 6, 3 + mulpadd 7, 3 + movq 56(x), %rdx + mulpadd 4, 7 + mulpade 5, 7 + mulpade 6, 7 + adcxq zero, %r14 +.endm + +// ------------------------------------------------------------------------ +// Multiply-add: z := x[0...7] * x off-diagonal elements +// ------------------------------------------------------------------------ + +.macro mulrows + sqrz + + .set I, 8 +.rep 8 + mulrow 0, I + .set I, (I+1) +.endr + + movq %r8, 128(z) + movq %r9, 136(z) + movq %r10, 144(z) + movq %r11, 152(z) + movq %r12, 160(z) + movq %r13, 168(z) + movq %r14, 176(z) + movq %r15, 184(z) +.endm + +// ------------------------------------------------------------------------ +// The actual code +// ------------------------------------------------------------------------ + + + +S2N_BN_SYMBOL(bignum_ksqr_16_32): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Now just systematically add in the rows to get all off-diagonal elements + + mulrows + addrows 8 + +// Double and add the diagonal elements. 
Note that z[0] was never written above + + xorl zeroe, zeroe + movq (x), %rdx + mulxq %rdx, %rax, %rcx + movq %rax, (z) + + movq 8(z), %rdx + adcxq %rdx, %rdx + adoxq %rcx, %rdx + movq %rdx, 8(z) + + .set I, 1 +.rep 14 + movq 8*I(x), %rdx + mulxq %rdx, %rax, %rcx + + movq 8*(2*I)(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 8*(2*I)(z) + + movq 8*(2*I+1)(z), %rdx + adcxq %rdx, %rdx + adoxq %rcx, %rdx + movq %rdx, 8*(2*I+1)(z) + .set I, (I+1) +.endr + + movq 8*I(x), %rdx + mulxq %rdx, %rax, %rcx + + movq 8*(2*I)(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 8*(2*I)(z) + + adcxq zero, %rcx + adoxq zero, %rcx + movq %rcx, 8*(2*I+1)(z) + .set I, (I+1) + + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_ksqr_32_64.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_ksqr_32_64.S new file mode 100644 index 00000000000..00956d919e4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_ksqr_32_64.S @@ -0,0 +1,798 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[32]; output z[64]; temporary buffer t[>=72] +// +// extern void bignum_ksqr_32_64 +// (uint64_t z[static 64], uint64_t x[static 32], uint64_t t[static 72]); +// +// This is a Karatsuba-style function squaring half-sized results +// and using temporary buffer t for intermediate results. The size of 72 +// is an overstatement for compatibility with the ARM version; it actually +// only uses 65 elements of t (64 + 1 for a suspended carry). 
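+//
+// In outline: with the input split as x = l + B*h, where B = 2^1024 and l, h
+// are the 16-word halves, the code uses 2*l*h = l^2 + h^2 - (l - h)^2, so
+//
+//     x^2 = l^2 + B^2*h^2 + B*(l^2 + h^2 - |l - h|^2)
+//
+// and only three half-size squarings plus one signed difference are needed.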
+// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = t +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = t +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_ksqr_32_64) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_ksqr_32_64) + .text + +#define K 16 + +#define z %rdi +#define x %rsi +#define t %rcx + +S2N_BN_SYMBOL(bignum_ksqr_32_64): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save callee-preserved registers once and for all at the outset +// Later we further reshuffle the input arguments to avoid extra saves + + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Move the temp space pointer since we need %rdx for multiplications + + movq %rdx, t + +// Square the low half + + callq bignum_ksqr_32_64_local_bignum_sqr_16_32 + +// Square the high half; from here on x and z are modified + + leaq 8*K(x), x // input at x+8*K + leaq 16*K(z), z // result at z+16*K + callq bignum_ksqr_32_64_local_bignum_sqr_16_32 + +// Form |x_lo - x_hi|, stored at t + + movq -8*K(x), %rax + subq (x), %rax + movq %rax, (t) + .set I, 1 + .rep K-1 + movq -8*K+8*I(x), %rax + sbbq 8*I(x), %rax + movq %rax, 8*I(t) + .set I, (I+1) + .endr + + movl $0, %ebx + sbbq %rax, %rax // Maintain CF, set ZF for cmovs + + .set I, 0 + .rep K + movq 8*I(t), %rdx + movq %rdx, %rax + notq %rdx + adcxq %rbx, %rdx + cmovzq %rax, %rdx + movq %rdx, 8*I(t) + .set I, (I+1) + .endr + +// Compose the middle parts [2,1] + [1,0] + [3,2] +// Put the low half of this at t[K] and the top half in place at z[2*K]; a +// fully in-place version is awkward with the otherwise beneficial double +// carry chain. Stash the carry suspended from the 3k position at the end of +// the temp buffer t[4*K]. + + xorl %edx, %edx + .set I, 0 + .rep K + movq -16*K+8*K+8*I(z), %rax + adcxq -16*K+8*I(z), %rax + adoxq -16*K+16*K+8*I(z), %rax + movq %rax, 8*K+8*I(t) + .set I, (I+1) + .endr + + .rep K + movq -16*K+8*K+8*I(z), %rax + adcxq -16*K+8*I(z), %rax + adoxq -16*K+16*K+8*I(z), %rax + movq %rax, -16*K+8*K+8*I(z) + .set I, (I+1) + .endr + + adoxq %rdx, %rdx + adcq $0, %rdx + movq %rdx, 32*K(t) + +// Square the absolute difference, putting the result M at t[2*K]. +// This involves another shuffle so now t' = z_orig and x' = t_orig +// while z' points within the temp buffer to the product M itself + + movq t, x + leaq -16*K(z), t + leaq 16*K(x), z + callq bignum_ksqr_32_64_local_bignum_sqr_16_32 + +// Subtract M, pausing at the 3k position to bump down accumulated carry. +// The carry cannot go negative since it's the top word of a value +// of the form ... 
+ h^2 + l^2 - (h - l)^2 >= 0 + + movq 8*K(x), %rax + subq (z), %rax + movq %rax, 8*K(t) + + .set I, 1 + + .rep (K-1) + movq 8*K+8*I(x), %rax + sbbq 8*I(z), %rax + movq %rax, 8*K+8*I(t) + .set I, (I+1) + .endr + + .rep K + movq 8*K+8*I(t), %rax + sbbq 8*I(z), %rax + movq %rax, 8*K+8*I(t) + .set I, (I+1) + .endr + + movq 32*K(x), %rdx + sbbq $0, %rdx + +// Finally propagate the carry to the top quarter + + xorl %eax, %eax + addq %rdx, 24*K(t) + .set I, 1 + .rep K-1 + adcq %rax, 24*K+8*I(t) + .set I, (I+1) + .endr + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +// Local copy of the half-length subroutine + +bignum_ksqr_32_64_local_bignum_sqr_16_32: + xorl %ebp, %ebp + movq (x), %rdx + mulxq 0x8(x), %r9, %rax + movq %r9, 0x8(z) + mulxq 0x10(x), %r10, %rbx + adcxq %rax, %r10 + movq %r10, 0x10(z) + mulxq 0x18(x), %r11, %rax + adcxq %rbx, %r11 + mulxq 0x20(x), %r12, %rbx + adcxq %rax, %r12 + mulxq 0x28(x), %r13, %rax + adcxq %rbx, %r13 + mulxq 0x30(x), %r14, %rbx + adcxq %rax, %r14 + mulxq 0x38(x), %r15, %r8 + adcxq %rbx, %r15 + adcxq %rbp, %r8 + xorl %ebp, %ebp + movq 0x8(x), %rdx + mulxq 0x10(x), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq %r11, 0x18(z) + mulxq 0x18(x), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq %r12, 0x20(z) + mulxq 0x20(x), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x28(x), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x30(x), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x38(x), %rax, %r9 + adcxq %rax, %r8 + adoxq %rbp, %r9 + movq 0x20(x), %rdx + mulxq 0x28(x), %rax, %r10 + adcxq %rax, %r9 + adoxq %rbp, %r10 + adcxq %rbp, %r10 + xorl %ebp, %ebp + movq 0x10(x), %rdx + mulxq 0x18(x), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + movq %r13, 0x28(z) + mulxq 0x20(x), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + movq %r14, 0x30(z) + mulxq 0x28(x), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x30(x), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x38(x), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + movq 0x30(x), %rdx + mulxq 0x20(x), %rax, %r11 + adcxq %rax, %r10 + adoxq %rbp, %r11 + mulxq 0x28(x), %rax, %r12 + adcxq %rax, %r11 + adoxq %rbp, %r12 + adcxq %rbp, %r12 + xorl %ebp, %ebp + movq 0x18(x), %rdx + mulxq 0x20(x), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + movq %r15, 0x38(z) + mulxq 0x28(x), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x30(x), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x38(x), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq 0x38(x), %rdx + mulxq 0x20(x), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x28(x), %rax, %r13 + adcxq %rax, %r12 + adoxq %rbp, %r13 + mulxq 0x30(x), %rax, %r14 + adcxq %rax, %r13 + adoxq %rbp, %r14 + adcxq %rbp, %r14 + movq 0x40(x), %rdx + xorl %ebp, %ebp + mulxq (x), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + movq %r8, 0x40(z) + mulxq 0x8(x), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x10(x), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x18(x), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x20(x), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x28(x), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x30(x), %rax, %r15 + adcxq %rax, %r14 + adoxq %rbp, %r15 + mulxq 0x38(x), %rax, %r8 + adcxq %rax, %r15 + adoxq %rbp, %r8 + adcxq %rbp, %r8 + movq 0x48(x), %rdx + xorl %ebp, %ebp + mulxq (x), %rax, %rbx + adcxq 
%rax, %r9 + adoxq %rbx, %r10 + movq %r9, 0x48(z) + mulxq 0x8(x), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(x), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(x), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x20(x), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x28(x), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x30(x), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x38(x), %rax, %r9 + adcxq %rax, %r8 + adoxq %rbp, %r9 + adcxq %rbp, %r9 + movq 0x50(x), %rdx + xorl %ebp, %ebp + mulxq (x), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq %r10, 0x50(z) + mulxq 0x8(x), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(x), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x18(x), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x20(x), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x28(x), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x30(x), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x38(x), %rax, %r10 + adcxq %rax, %r9 + adoxq %rbp, %r10 + adcxq %rbp, %r10 + movq 0x58(x), %rdx + xorl %ebp, %ebp + mulxq (x), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq %r11, 0x58(z) + mulxq 0x8(x), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(x), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x18(x), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x20(x), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x28(x), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x30(x), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x38(x), %rax, %r11 + adcxq %rax, %r10 + adoxq %rbp, %r11 + adcxq %rbp, %r11 + movq 0x60(x), %rdx + xorl %ebp, %ebp + mulxq (x), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq %r12, 0x60(z) + mulxq 0x8(x), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x10(x), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x18(x), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x20(x), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x28(x), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x30(x), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x38(x), %rax, %r12 + adcxq %rax, %r11 + adoxq %rbp, %r12 + adcxq %rbp, %r12 + movq 0x68(x), %rdx + xorl %ebp, %ebp + mulxq (x), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + movq %r13, 0x68(z) + mulxq 0x8(x), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x10(x), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x18(x), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x20(x), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x28(x), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x30(x), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x38(x), %rax, %r13 + adcxq %rax, %r12 + adoxq %rbp, %r13 + adcxq %rbp, %r13 + movq 0x70(x), %rdx + xorl %ebp, %ebp + mulxq (x), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + movq %r14, 0x70(z) + mulxq 0x8(x), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x10(x), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x18(x), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x20(x), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(x), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x30(x), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x38(x), %rax, %r14 + adcxq %rax, %r13 + adoxq %rbp, %r14 + adcxq %rbp, %r14 + movq 0x78(x), %rdx + xorl %ebp, %ebp + 
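// Last cross row, x[15] * x[0..7], folded into the window for z[15..23] +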
mulxq (x), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + movq %r15, 0x78(z) + mulxq 0x8(x), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x10(x), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x18(x), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x20(x), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x28(x), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x30(x), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x38(x), %rax, %r15 + adcxq %rax, %r14 + adoxq %rbp, %r15 + adcxq %rbp, %r15 + movq %r8, 0x80(z) + movq %r9, 0x88(z) + movq %r10, 0x90(z) + movq %r11, 0x98(z) + movq %r12, 0xa0(z) + movq %r13, 0xa8(z) + movq %r14, 0xb0(z) + movq %r15, 0xb8(z) + xorl %ebp, %ebp + movq 0x88(z), %r9 + movq 0x90(z), %r10 + movq 0x98(z), %r11 + movq 0xa0(z), %r12 + movq 0xa8(z), %r13 + movq 0xb0(z), %r14 + movq 0xb8(z), %r15 + movq 0x40(x), %rdx + mulxq 0x48(x), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + movq %r9, 0x88(z) + mulxq 0x50(x), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq %r10, 0x90(z) + mulxq 0x58(x), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x60(x), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x68(x), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x70(x), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x78(x), %rax, %r8 + adcxq %rax, %r15 + adoxq %rbp, %r8 + adcxq %rbp, %r8 + xorl %ebp, %ebp + movq 0x48(x), %rdx + mulxq 0x50(x), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq %r11, 0x98(z) + mulxq 0x58(x), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq %r12, 0xa0(z) + mulxq 0x60(x), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x68(x), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x70(x), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x78(x), %rax, %r9 + adcxq %rax, %r8 + adoxq %rbp, %r9 + movq 0x60(x), %rdx + mulxq 0x68(x), %rax, %r10 + adcxq %rax, %r9 + adoxq %rbp, %r10 + adcxq %rbp, %r10 + xorl %ebp, %ebp + movq 0x50(x), %rdx + mulxq 0x58(x), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + movq %r13, 0xa8(z) + mulxq 0x60(x), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + movq %r14, 0xb0(z) + mulxq 0x68(x), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x70(x), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x78(x), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + movq 0x70(x), %rdx + mulxq 0x60(x), %rax, %r11 + adcxq %rax, %r10 + adoxq %rbp, %r11 + mulxq 0x68(x), %rax, %r12 + adcxq %rax, %r11 + adoxq %rbp, %r12 + adcxq %rbp, %r12 + xorl %ebp, %ebp + movq 0x58(x), %rdx + mulxq 0x60(x), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + movq %r15, 0xb8(z) + mulxq 0x68(x), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x70(x), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x78(x), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq 0x78(x), %rdx + mulxq 0x60(x), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(x), %rax, %r13 + adcxq %rax, %r12 + adoxq %rbp, %r13 + mulxq 0x70(x), %rax, %r14 + adcxq %rax, %r13 + adoxq %rbp, %r14 + adcxq %rbp, %r14 + movq %r8, 0xc0(z) + movq %r9, 0xc8(z) + movq %r10, 0xd0(z) + movq %r11, 0xd8(z) + movq %r12, 0xe0(z) + movq %r13, 0xe8(z) + movq %r14, 0xf0(z) + xorl %ebp, %ebp + movq (x), %rdx + mulxq %rdx, %rax, %rbx + movq %rax, (z) + movq 0x8(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0x8(z) + movq 0x8(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0x10(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq 
%rdx, 0x10(z) + movq 0x18(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0x18(z) + movq 0x10(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0x20(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0x20(z) + movq 0x28(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0x28(z) + movq 0x18(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0x30(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0x30(z) + movq 0x38(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0x38(z) + movq 0x20(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0x40(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0x40(z) + movq 0x48(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0x48(z) + movq 0x28(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0x50(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0x50(z) + movq 0x58(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0x58(z) + movq 0x30(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0x60(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0x60(z) + movq 0x68(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0x68(z) + movq 0x38(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0x70(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0x70(z) + movq 0x78(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0x78(z) + movq 0x40(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0x80(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0x80(z) + movq 0x88(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0x88(z) + movq 0x48(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0x90(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0x90(z) + movq 0x98(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0x98(z) + movq 0x50(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0xa0(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0xa0(z) + movq 0xa8(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0xa8(z) + movq 0x58(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0xb0(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0xb0(z) + movq 0xb8(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0xb8(z) + movq 0x60(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0xc0(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0xc0(z) + movq 0xc8(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0xc8(z) + movq 0x68(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0xd0(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0xd0(z) + movq 0xd8(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0xd8(z) + movq 0x70(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0xe0(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0xe0(z) + movq 0xe8(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0xe8(z) + movq 0x78(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0xf0(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0xf0(z) + adcxq %rbp, %rbx + adoxq %rbp, %rbx + movq %rbx, 0xf8(z) + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_4_8.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_4_8.S new file mode 100644 index 00000000000..1df6d6c9d88 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_4_8.S @@ -0,0 +1,174 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[4], y[4]; output z[8] +// +// extern void bignum_mul_4_8 +// (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// Copied in or set up + +#define y %rcx + +// A zero register + +#define zero %rbp +#define zeroe %ebp + +// Add in x[i] * %rdx to the (i,i+1) position with the register window +// Would be nice to have conditional expressions reg[i], reg[i+1] ... + +.macro mulpadd arg1,arg2 + mulxq 8*\arg2(x), %rax, %rbx +.if ((\arg1 + \arg2) % 4 == 0) + adcxq %rax, %r8 + adoxq %rbx, %r9 +.elseif ((\arg1 + \arg2) % 4 == 1) + adcxq %rax, %r9 + adoxq %rbx, %r10 +.elseif ((\arg1 + \arg2) % 4 == 2) + adcxq %rax, %r10 + adoxq %rbx, %r11 +.elseif ((\arg1 + \arg2) % 4 == 3) + adcxq %rax, %r11 + adoxq %rbx, %r8 +.endif + +.endm + + +// Add in the whole j'th row + +.macro addrow arg1 + movq 8*\arg1(y), %rdx + xorl zeroe, zeroe + + mulpadd \arg1, 0 + +.if (\arg1 % 4 == 0) + movq %r8, 8*\arg1(z) +.elseif (\arg1 % 4 == 1) + movq %r9, 8*\arg1(z) +.elseif (\arg1 % 4 == 2) + movq %r10, 8*\arg1(z) +.elseif (\arg1 % 4 == 3) + movq %r11, 8*\arg1(z) +.endif + + mulpadd \arg1, 1 + mulpadd \arg1, 2 + +.if (\arg1 % 4 == 0) + mulxq 24(x), %rax, %r8 + adcxq %rax, %r11 + adoxq zero, %r8 + adcxq zero, %r8 +.elseif (\arg1 % 4 == 1) + mulxq 24(x), %rax, %r9 + adcxq %rax, %r8 + adoxq zero, %r9 + adcxq zero, %r9 +.elseif (\arg1 % 4 == 2) + mulxq 24(x), %rax, %r10 + adcxq %rax, %r9 + adoxq zero, %r10 + adcxq zero, %r10 +.elseif (\arg1 % 4 == 3) + mulxq 24(x), %rax, %r11 + adcxq %rax, %r10 + adoxq zero, %r11 + adcxq zero, %r11 +.endif + +.endm + + + +S2N_BN_SYMBOL(bignum_mul_4_8): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %rbp + pushq %rbx + +// Copy y into a safe register to start with + + movq %rdx, y + +// Zero a register, which also makes sure we don't get a fake carry-in + + xorl zeroe, zeroe + +// Do the zeroth row, which is a bit different +// Write back the zero-zero product and then accumulate +// %r8,%r11,%r10,%r9 as y[0] * x from 1..4 + + movq (y), %rdx + + mulxq (x), %r8, %r9 + movq %r8, (z) + + mulxq 8(x), %rbx, %r10 + adcxq %rbx, %r9 + + mulxq 16(x), %rbx, %r11 + adcxq %rbx, %r10 + + mulxq 24(x), %rbx, %r8 + adcxq %rbx, %r11 + adcxq zero, %r8 + +// Now all the other rows in a uniform pattern + + addrow 1 + addrow 2 + addrow 3 + +// Now write back the additional columns + + movq %r8, 32(z) + movq %r9, 40(z) + movq %r10, 48(z) + movq %r11, 56(z) + +// Restore registers and return + + popq %rbx + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_4_8_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_4_8_alt.S new file mode 100644 index 00000000000..4730daa6751 --- /dev/null +++ 
b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_4_8_alt.S @@ -0,0 +1,146 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[4], y[4]; output z[8] +// +// extern void bignum_mul_4_8_alt +// (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8_alt) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// This is moved from %rdx to free it for muls + +#define y %rcx + +// Other variables used as a rotating 3-word window to add terms to + +#define t0 %r8 +#define t1 %r9 +#define t2 %r10 + +// Macro for the key "multiply and add to (c,h,l)" step + +#define combadd(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// A minutely shorter form for when c = 0 initially + +#define combadz(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq c, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h + +S2N_BN_SYMBOL(bignum_mul_4_8_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Copy y into a safe register to start with + + movq %rdx, y + +// Result term 0 + + movq (x), %rax + mulq (y) + + movq %rax, (z) + movq %rdx, t0 + xorq t1, t1 + +// Result term 1 + + xorq t2, t2 + combads(t1,t0,(x),8(y)) + combadz(t2,t1,t0,8(x),(y)) + movq t0, 8(z) + +// Result term 2 + + xorq t0, t0 + combadz(t0,t2,t1,(x),16(y)) + combadd(t0,t2,t1,8(x),8(y)) + combadd(t0,t2,t1,16(x),(y)) + movq t1, 16(z) + +// Result term 3 + + xorq t1, t1 + combadz(t1,t0,t2,(x),24(y)) + combadd(t1,t0,t2,8(x),16(y)) + combadd(t1,t0,t2,16(x),8(y)) + combadd(t1,t0,t2,24(x),(y)) + movq t2, 24(z) + +// Result term 4 + + xorq t2, t2 + combadz(t2,t1,t0,8(x),24(y)) + combadd(t2,t1,t0,16(x),16(y)) + combadd(t2,t1,t0,24(x),8(y)) + movq t0, 32(z) + +// Result term 5 + + xorq t0, t0 + combadz(t0,t2,t1,16(x),24(y)) + combadd(t0,t2,t1,24(x),16(y)) + movq t1, 40(z) + +// Result term 6 + + xorq t1, t1 + combads(t0,t2,24(x),24(y)) + movq t2, 48(z) + +// Result term 7 + + movq t0, 56(z) + +// Return + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_6_12.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_6_12.S new file mode 100644 index 00000000000..87dbfa09d4e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_6_12.S @@ -0,0 +1,210 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[6], y[6]; output z[12] +// +// extern void bignum_mul_6_12 +// (uint64_t z[static 12], uint64_t x[static 6], uint64_t y[static 6]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// Copied in or set up + +#define y %rcx + +// A zero register + +#define zero %rbp +#define zeroe %ebp + +// Add in x[i] * %rdx to the (i,i+1) position with the register window +// Would be nice to have conditional expressions reg[i], reg[i+1] ... + +.macro mulpadd arg1,arg2 + mulxq 8*\arg2(x), %rax, %rbx +.if ((\arg1 + \arg2) % 6 == 0) + adcxq %rax, %r8 + adoxq %rbx, %r9 +.elseif ((\arg1 + \arg2) % 6 == 1) + adcxq %rax, %r9 + adoxq %rbx, %r10 +.elseif ((\arg1 + \arg2) % 6 == 2) + adcxq %rax, %r10 + adoxq %rbx, %r11 +.elseif ((\arg1 + \arg2) % 6 == 3) + adcxq %rax, %r11 + adoxq %rbx, %r12 +.elseif ((\arg1 + \arg2) % 6 == 4) + adcxq %rax, %r12 + adoxq %rbx, %r13 +.elseif ((\arg1 + \arg2) % 6 == 5) + adcxq %rax, %r13 + adoxq %rbx, %r8 +.endif + +.endm + + +// Add in the whole j'th row + +.macro addrow arg1 + movq 8*\arg1(y), %rdx + xorl zeroe, zeroe + + mulpadd \arg1, 0 + +.if (\arg1 % 6 == 0) + movq %r8, 8*\arg1(z) +.elseif (\arg1 % 6 == 1) + movq %r9, 8*\arg1(z) +.elseif (\arg1 % 6 == 2) + movq %r10, 8*\arg1(z) +.elseif (\arg1 % 6 == 3) + movq %r11, 8*\arg1(z) +.elseif (\arg1 % 6 == 4) + movq %r12, 8*\arg1(z) +.elseif (\arg1 % 6 == 5) + movq %r13, 8*\arg1(z) +.endif + + mulpadd \arg1, 1 + mulpadd \arg1, 2 + mulpadd \arg1, 3 + mulpadd \arg1, 4 + +.if (\arg1 % 6 == 0) + mulxq 40(x), %rax, %r8 + adcxq %rax, %r13 + adoxq zero, %r8 + adcxq zero, %r8 +.elseif (\arg1 % 6 == 1) + mulxq 40(x), %rax, %r9 + adcxq %rax, %r8 + adoxq zero, %r9 + adcxq zero, %r9 +.elseif (\arg1 % 6 == 2) + mulxq 40(x), %rax, %r10 + adcxq %rax, %r9 + adoxq zero, %r10 + adcxq zero, %r10 +.elseif (\arg1 % 6 == 3) + mulxq 40(x), %rax, %r11 + adcxq %rax, %r10 + adoxq zero, %r11 + adcxq zero, %r11 +.elseif (\arg1 % 6 == 4) + mulxq 40(x), %rax, %r12 + adcxq %rax, %r11 + adoxq zero, %r12 + adcxq zero, %r12 +.elseif (\arg1 % 6 == 5) + mulxq 40(x), %rax, %r13 + adcxq %rax, %r12 + adoxq zero, %r13 + adcxq zero, %r13 +.endif + +.endm + + + +S2N_BN_SYMBOL(bignum_mul_6_12): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + +// Copy y into a safe register to start with + + movq %rdx, y + +// Zero a register, which also makes sure we don't get a fake carry-in + + xorl zeroe, zeroe + +// Do the zeroth row, which is a bit different +// Write back the zero-zero product and then accumulate +// %r8,%r13,%r12,%r11,%r10,%r9 as y[0] * x from 1..6 + + movq (y), %rdx + + mulxq (x), %r8, %r9 + movq %r8, (z) + + mulxq 8(x), %rbx, %r10 + adcxq %rbx, %r9 + + mulxq 16(x), %rbx, %r11 + adcxq %rbx, %r10 + + mulxq 24(x), %rbx, %r12 + adcxq %rbx, %r11 + + mulxq 32(x), %rbx, %r13 + adcxq %rbx, %r12 + + mulxq 40(x), %rbx, %r8 + adcxq %rbx, %r13 + adcxq zero, %r8 + +// Now all the other rows in a 
uniform pattern + + addrow 1 + addrow 2 + addrow 3 + addrow 4 + addrow 5 + +// Now write back the additional columns + + movq %r8, 48(z) + movq %r9, 56(z) + movq %r10, 64(z) + movq %r11, 72(z) + movq %r12, 80(z) + movq %r13, 88(z) + +// Restore registers and return + + popq %r13 + popq %r12 + popq %rbx + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_6_12_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_6_12_alt.S new file mode 100644 index 00000000000..36bceceb536 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_6_12_alt.S @@ -0,0 +1,185 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[6], y[6]; output z[12] +// +// extern void bignum_mul_6_12_alt +// (uint64_t z[static 12], uint64_t x[static 6], uint64_t y[static 6]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12_alt) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// This is moved from %rdx to free it for muls + +#define y %rcx + +// Other variables used as a rotating 3-word window to add terms to + +#define t0 %r8 +#define t1 %r9 +#define t2 %r10 + +// Macro for the key "multiply and add to (c,h,l)" step + +#define combadd(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// A minutely shorter form for when c = 0 initially + +#define combadz(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq c, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h + +S2N_BN_SYMBOL(bignum_mul_6_12_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Copy y into a safe register to start with + + movq %rdx, y + +// Result term 0 + + movq (x), %rax + mulq (y) + + movq %rax, (z) + movq %rdx, t0 + xorq t1, t1 + +// Result term 1 + + xorq t2, t2 + combads(t1,t0,(x),8(y)) + combadz(t2,t1,t0,8(x),(y)) + movq t0, 8(z) + +// Result term 2 + + xorq t0, t0 + combadz(t0,t2,t1,(x),16(y)) + combadd(t0,t2,t1,8(x),8(y)) + combadd(t0,t2,t1,16(x),(y)) + movq t1, 16(z) + +// Result term 3 + + xorq t1, t1 + combadz(t1,t0,t2,(x),24(y)) + combadd(t1,t0,t2,8(x),16(y)) + combadd(t1,t0,t2,16(x),8(y)) + combadd(t1,t0,t2,24(x),(y)) + movq t2, 24(z) + +// Result term 4 + + xorq t2, t2 + combadz(t2,t1,t0,(x),32(y)) + combadd(t2,t1,t0,8(x),24(y)) + combadd(t2,t1,t0,16(x),16(y)) + combadd(t2,t1,t0,24(x),8(y)) + combadd(t2,t1,t0,32(x),(y)) + movq t0, 32(z) + +// Result term 5 + + xorq t0, t0 + combadz(t0,t2,t1,(x),40(y)) + combadd(t0,t2,t1,8(x),32(y)) + combadd(t0,t2,t1,16(x),24(y)) + combadd(t0,t2,t1,24(x),16(y)) + combadd(t0,t2,t1,32(x),8(y)) + combadd(t0,t2,t1,40(x),(y)) + movq t1, 40(z) + +// Result term 6 + + xorq t1, t1 + 
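// t1, just written out as z[5], is cleared to serve as the fresh top word of the rotating window +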
combadz(t1,t0,t2,8(x),40(y)) + combadd(t1,t0,t2,16(x),32(y)) + combadd(t1,t0,t2,24(x),24(y)) + combadd(t1,t0,t2,32(x),16(y)) + combadd(t1,t0,t2,40(x),8(y)) + movq t2, 48(z) + +// Result term 7 + + xorq t2, t2 + combadz(t2,t1,t0,16(x),40(y)) + combadd(t2,t1,t0,24(x),32(y)) + combadd(t2,t1,t0,32(x),24(y)) + combadd(t2,t1,t0,40(x),16(y)) + movq t0, 56(z) + +// Result term 8 + + xorq t0, t0 + combadz(t0,t2,t1,24(x),40(y)) + combadd(t0,t2,t1,32(x),32(y)) + combadd(t0,t2,t1,40(x),24(y)) + movq t1, 64(z) + +// Result term 9 + + xorq t1, t1 + combadz(t1,t0,t2,32(x),40(y)) + combadd(t1,t0,t2,40(x),32(y)) + movq t2, 72(z) + +// Result term 10 + + combads(t1,t0,40(x),40(y)) + movq t0, 80(z) + +// Result term 11 + + movq t1, 88(z) + +// Return + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_8_16.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_8_16.S new file mode 100644 index 00000000000..598fccd51d5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_8_16.S @@ -0,0 +1,260 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[8], y[8]; output z[16] +// +// extern void bignum_mul_8_16 +// (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// Copied in or set up + +#define y %rcx + +// A zero register + +#define zero %rbp +#define zeroe %ebp + +// mulpadd i, j adds x[i] * rdx (now assumed = y[j]) into the window at i+j + +.macro mulpadd arg1,arg2 + mulxq 8*\arg1(x), %rax, %rbx +.if ((\arg1 + \arg2) % 8 == 0) + adcxq %rax, %r8 + adoxq %rbx, %r9 +.elseif ((\arg1 + \arg2) % 8 == 1) + adcxq %rax, %r9 + adoxq %rbx, %r10 +.elseif ((\arg1 + \arg2) % 8 == 2) + adcxq %rax, %r10 + adoxq %rbx, %r11 +.elseif ((\arg1 + \arg2) % 8 == 3) + adcxq %rax, %r11 + adoxq %rbx, %r12 +.elseif ((\arg1 + \arg2) % 8 == 4) + adcxq %rax, %r12 + adoxq %rbx, %r13 +.elseif ((\arg1 + \arg2) % 8 == 5) + adcxq %rax, %r13 + adoxq %rbx, %r14 +.elseif ((\arg1 + \arg2) % 8 == 6) + adcxq %rax, %r14 + adoxq %rbx, %r15 +.elseif ((\arg1 + \arg2) % 8 == 7) + adcxq %rax, %r15 + adoxq %rbx, %r8 +.endif + +.endm + +// mulpade i, j adds x[i] * rdx (now assumed = y[j]) into the window at i+j +// but re-creates the top word assuming nothing to add there + +.macro mulpade arg1,arg2 +.if ((\arg1 + \arg2) % 8 == 0) + mulxq 8*\arg1(x), %rax, %r9 + adcxq %rax, %r8 + adoxq zero, %r9 +.elseif ((\arg1 + \arg2) % 8 == 1) + mulxq 8*\arg1(x), %rax, %r10 + adcxq %rax, %r9 + adoxq zero, %r10 +.elseif ((\arg1 + \arg2) % 8 == 2) + mulxq 8*\arg1(x), %rax, %r11 + adcxq %rax, %r10 + adoxq zero, %r11 +.elseif ((\arg1 + \arg2) % 8 == 3) + mulxq 8*\arg1(x), %rax, %r12 + adcxq %rax, %r11 + adoxq zero, %r12 +.elseif ((\arg1 + \arg2) % 8 == 4) + mulxq 8*\arg1(x), %rax, %r13 + adcxq %rax, %r12 + adoxq zero, %r13 +.elseif ((\arg1 + 
\arg2) % 8 == 5) + mulxq 8*\arg1(x), %rax, %r14 + adcxq %rax, %r13 + adoxq zero, %r14 +.elseif ((\arg1 + \arg2) % 8 == 6) + mulxq 8*\arg1(x), %rax, %r15 + adcxq %rax, %r14 + adoxq zero, %r15 +.elseif ((\arg1 + \arg2) % 8 == 7) + mulxq 8*\arg1(x), %rax, %r8 + adcxq %rax, %r15 + adoxq zero, %r8 +.endif + +.endm + +// Add in the whole j'th row + +.macro addrow arg1 + movq 8*\arg1(y), %rdx + xorl zeroe, zeroe + + mulpadd 0, \arg1 + +.if (\arg1 % 8 == 0) + movq %r8, 8*\arg1(z) +.elseif (\arg1 % 8 == 1) + movq %r9, 8*\arg1(z) +.elseif (\arg1 % 8 == 2) + movq %r10, 8*\arg1(z) +.elseif (\arg1 % 8 == 3) + movq %r11, 8*\arg1(z) +.elseif (\arg1 % 8 == 4) + movq %r12, 8*\arg1(z) +.elseif (\arg1 % 8 == 5) + movq %r13, 8*\arg1(z) +.elseif (\arg1 % 8 == 6) + movq %r14, 8*\arg1(z) +.elseif (\arg1 % 8 == 7) + movq %r15, 8*\arg1(z) +.endif + + mulpadd 1, \arg1 + mulpadd 2, \arg1 + mulpadd 3, \arg1 + mulpadd 4, \arg1 + mulpadd 5, \arg1 + mulpadd 6, \arg1 + mulpade 7, \arg1 + +.if (\arg1 % 8 == 0) + adcq zero, %r8 +.elseif (\arg1 % 8 == 1) + adcq zero, %r9 +.elseif (\arg1 % 8 == 2) + adcq zero, %r10 +.elseif (\arg1 % 8 == 3) + adcq zero, %r11 +.elseif (\arg1 % 8 == 4) + adcq zero, %r12 +.elseif (\arg1 % 8 == 5) + adcq zero, %r13 +.elseif (\arg1 % 8 == 6) + adcq zero, %r14 +.elseif (\arg1 % 8 == 7) + adcq zero, %r15 +.endif + +.endm + + +S2N_BN_SYMBOL(bignum_mul_8_16): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Copy y into a safe register to start with + + movq %rdx, y + +// Zero a register, which also makes sure we don't get a fake carry-in + + xorl zeroe, zeroe + +// Do the zeroth row, which is a bit different +// Write back the zero-zero product and then accumulate +// %r8,%r15,%r14,%r13,%r12,%r11,%r10,%r9 as y[0] * x from 1..8 + + movq (y), %rdx + + mulxq (x), %r8, %r9 + movq %r8, (z) + + mulxq 8(x), %rbx, %r10 + adcq %rbx, %r9 + + mulxq 16(x), %rbx, %r11 + adcq %rbx, %r10 + + mulxq 24(x), %rbx, %r12 + adcq %rbx, %r11 + + mulxq 32(x), %rbx, %r13 + adcq %rbx, %r12 + + mulxq 40(x), %rbx, %r14 + adcq %rbx, %r13 + + mulxq 48(x), %rbx, %r15 + adcq %rbx, %r14 + + mulxq 56(x), %rbx, %r8 + adcq %rbx, %r15 + adcq zero, %r8 + +// Now all the other rows in a uniform pattern + + addrow 1 + addrow 2 + addrow 3 + addrow 4 + addrow 5 + addrow 6 + addrow 7 + +// Now write back the additional columns + + movq %r8, 64(z) + movq %r9, 72(z) + movq %r10, 80(z) + movq %r11, 88(z) + movq %r12, 96(z) + movq %r13, 104(z) + movq %r14, 112(z) + movq %r15, 120(z) + +// Real epilog + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_8_16_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_8_16_alt.S new file mode 100644 index 00000000000..a1a2a67e714 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_8_16_alt.S @@ -0,0 +1,233 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[8], y[8]; output z[16] +// +// extern void bignum_mul_8_16_alt +// (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16_alt) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// This is moved from %rdx to free it for muls + +#define y %rcx + +// Other variables used as a rotating 3-word window to add terms to + +#define t0 %r8 +#define t1 %r9 +#define t2 %r10 + +// Macro for the key "multiply and add to (c,h,l)" step + +#define combadd(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// A minutely shorter form for when c = 0 initially + +#define combadz(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq c, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h + +S2N_BN_SYMBOL(bignum_mul_8_16_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Copy y into a safe register to start with + + movq %rdx, y + +// Result term 0 + + movq (x), %rax + mulq (y) + + movq %rax, (z) + movq %rdx, t0 + xorq t1, t1 + +// Result term 1 + + xorq t2, t2 + combads(t1,t0,(x),8(y)) + combadz(t2,t1,t0,8(x),(y)) + movq t0, 8(z) + +// Result term 2 + + xorq t0, t0 + combadz(t0,t2,t1,(x),16(y)) + combadd(t0,t2,t1,8(x),8(y)) + combadd(t0,t2,t1,16(x),(y)) + movq t1, 16(z) + +// Result term 3 + + xorq t1, t1 + combadz(t1,t0,t2,(x),24(y)) + combadd(t1,t0,t2,8(x),16(y)) + combadd(t1,t0,t2,16(x),8(y)) + combadd(t1,t0,t2,24(x),(y)) + movq t2, 24(z) + +// Result term 4 + + xorq t2, t2 + combadz(t2,t1,t0,(x),32(y)) + combadd(t2,t1,t0,8(x),24(y)) + combadd(t2,t1,t0,16(x),16(y)) + combadd(t2,t1,t0,24(x),8(y)) + combadd(t2,t1,t0,32(x),(y)) + movq t0, 32(z) + +// Result term 5 + + xorq t0, t0 + combadz(t0,t2,t1,(x),40(y)) + combadd(t0,t2,t1,8(x),32(y)) + combadd(t0,t2,t1,16(x),24(y)) + combadd(t0,t2,t1,24(x),16(y)) + combadd(t0,t2,t1,32(x),8(y)) + combadd(t0,t2,t1,40(x),(y)) + movq t1, 40(z) + +// Result term 6 + + xorq t1, t1 + combadz(t1,t0,t2,(x),48(y)) + combadd(t1,t0,t2,8(x),40(y)) + combadd(t1,t0,t2,16(x),32(y)) + combadd(t1,t0,t2,24(x),24(y)) + combadd(t1,t0,t2,32(x),16(y)) + combadd(t1,t0,t2,40(x),8(y)) + combadd(t1,t0,t2,48(x),(y)) + movq t2, 48(z) + +// Result term 7 + + xorq t2, t2 + combadz(t2,t1,t0,(x),56(y)) + combadd(t2,t1,t0,8(x),48(y)) + combadd(t2,t1,t0,16(x),40(y)) + combadd(t2,t1,t0,24(x),32(y)) + combadd(t2,t1,t0,32(x),24(y)) + combadd(t2,t1,t0,40(x),16(y)) + combadd(t2,t1,t0,48(x),8(y)) + combadd(t2,t1,t0,56(x),(y)) + movq t0, 56(z) + +// Result term 8 + + xorq t0, t0 + combadz(t0,t2,t1,8(x),56(y)) + combadd(t0,t2,t1,16(x),48(y)) + combadd(t0,t2,t1,24(x),40(y)) + combadd(t0,t2,t1,32(x),32(y)) + combadd(t0,t2,t1,40(x),24(y)) + combadd(t0,t2,t1,48(x),16(y)) + combadd(t0,t2,t1,56(x),8(y)) + movq t1, 64(z) + +// Result term 9 + + xorq t1, t1 + combadz(t1,t0,t2,16(x),56(y)) + 
combadd(t1,t0,t2,24(x),48(y)) + combadd(t1,t0,t2,32(x),40(y)) + combadd(t1,t0,t2,40(x),32(y)) + combadd(t1,t0,t2,48(x),24(y)) + combadd(t1,t0,t2,56(x),16(y)) + movq t2, 72(z) + +// Result term 10 + + xorq t2, t2 + combadz(t2,t1,t0,24(x),56(y)) + combadd(t2,t1,t0,32(x),48(y)) + combadd(t2,t1,t0,40(x),40(y)) + combadd(t2,t1,t0,48(x),32(y)) + combadd(t2,t1,t0,56(x),24(y)) + movq t0, 80(z) + +// Result term 11 + + xorq t0, t0 + combadz(t0,t2,t1,32(x),56(y)) + combadd(t0,t2,t1,40(x),48(y)) + combadd(t0,t2,t1,48(x),40(y)) + combadd(t0,t2,t1,56(x),32(y)) + movq t1, 88(z) + +// Result term 12 + + xorq t1, t1 + combadz(t1,t0,t2,40(x),56(y)) + combadd(t1,t0,t2,48(x),48(y)) + combadd(t1,t0,t2,56(x),40(y)) + movq t2, 96(z) + +// Result term 13 + + xorq t2, t2 + combadz(t2,t1,t0,48(x),56(y)) + combadd(t2,t1,t0,56(x),48(y)) + movq t0, 104(z) + +// Result term 14 + + combads(t2,t1,56(x),56(y)) + movq t1, 112(z) + +// Result term 11 + + movq t2, 120(z) + +// Return + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_4_8.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_4_8.S new file mode 100644 index 00000000000..4b19675569b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_4_8.S @@ -0,0 +1,145 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[4]; output z[8] +// +// extern void bignum_sqr_4_8 (uint64_t z[static 8], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// A zero register + +#define zero %rbp +#define zeroe %ebp + +// Other registers + +#define d1 %r8 +#define d2 %r9 +#define d3 %r10 +#define d4 %r11 +#define d5 %r12 +#define d6 %r13 + + + +S2N_BN_SYMBOL(bignum_sqr_4_8): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %rbp + pushq %r12 + pushq %r13 + +// Set up an initial window [d6;...d1] = [23;03;01] + + movq (x), %rdx + mulxq 8(x), d1, d2 + mulxq 24(x), d3, d4 + movq 16(x), %rdx + mulxq 24(x), d5, d6 + +// Clear our zero register, and also initialize the flags for the carry chain + + xorl zeroe, zeroe + +// Chain in the addition of 02 + 12 + 13 to that window (no carry-out possible) +// This gives all the "heterogeneous" terms of the squaring ready to double + + mulxq (x), %rax, %rcx + adcxq %rax, d2 + adoxq %rcx, d3 + mulxq 8(x), %rax, %rcx + adcxq %rax, d3 + adoxq %rcx, d4 + movq 24(x), %rdx + mulxq 8(x), %rax, %rcx + adcxq %rax, d4 + adoxq %rcx, d5 + adcxq zero, d5 + adoxq zero, d6 + adcxq zero, d6 + +// In principle this is otiose as CF and OF carries are absorbed at this point +// However it seems helpful for the OOO engine to be told it's a fresh start + + xorl zeroe, zeroe + +// Double and add to the 00 + 11 + 22 + 33 terms +// +// We could use shift-double but this seems tidier and in larger squarings 
+// it was actually more efficient. I haven't experimented with this small +// case to see how much that matters. Note: the writeback here is sprinkled +// into the sequence in such a way that things still work if z = x, i.e. if +// the output overwrites the input buffer and beyond. + + movq (x), %rdx + mulxq %rdx, %rax, %rdx + movq %rax, (z) + adcxq d1, d1 + adoxq %rdx, d1 + movq 8(x), %rdx + movq d1, 8(z) + mulxq %rdx, %rax, %rdx + adcxq d2, d2 + adoxq %rax, d2 + adcxq d3, d3 + adoxq %rdx, d3 + movq 16(x), %rdx + movq d2, 16(z) + mulxq %rdx, %rax, %rdx + adcxq d4, d4 + adoxq %rax, d4 + adcxq d5, d5 + adoxq %rdx, d5 + movq 24(x), %rdx + movq d3, 24(z) + mulxq %rdx, %rax, %rdx + movq d4, 32(z) + adcxq d6, d6 + movq d5, 40(z) + adoxq %rax, d6 + movq d6, 48(z) + adcxq zero, %rdx + adoxq zero, %rdx + movq %rdx, 56(z) + +// Restore saved registers and return + + popq %r13 + popq %r12 + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_4_8_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_4_8_alt.S new file mode 100644 index 00000000000..693a57d74d1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_4_8_alt.S @@ -0,0 +1,134 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[4]; output z[8] +// +// extern void bignum_sqr_4_8_alt +// (uint64_t z[static 8], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8_alt) + .text + +// Input arguments + +#define z %rdi +#define x %rsi + +// Other variables used as a rotating 3-word window to add terms to + +#define t0 %rcx +#define t1 %r8 +#define t2 %r9 + +// Macro for the key "multiply and add to (c,h,l)" step, for square term + +#define combadd1(c,h,l,numa) \ + movq numa, %rax ; \ + mulq %rax; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa) \ + movq numa, %rax ; \ + mulq %rax; \ + addq %rax, l ; \ + adcq %rdx, h + +// A version doubling before adding, for non-square terms + +#define combadd2(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0, c ; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +S2N_BN_SYMBOL(bignum_sqr_4_8_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Result term 0 + + movq (x), %rax + mulq %rax + + movq %rax, (z) + movq %rdx, t0 + xorq t1, t1 + +// Result term 1 + + xorq t2, t2 + combadd2(t2,t1,t0,(x),8(x)) + movq t0, 8(z) + +// Result term 2 + + xorq t0, t0 + combadd1(t0,t2,t1,8(x)) + combadd2(t0,t2,t1,(x),16(x)) + movq t1, 16(z) + +// Result term 3 + + xorq t1, t1 + combadd2(t1,t0,t2,(x),24(x)) + combadd2(t1,t0,t2,8(x),16(x)) + movq t2, 24(z) + +// Result term 4 + + xorq t2, t2 + combadd2(t2,t1,t0,8(x),24(x)) + combadd1(t2,t1,t0,16(x)) + movq t0, 32(z) + +// Result term 5 + + xorq t0, t0 + 
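// Column 5 gets only the doubled cross product x[2]*x[3] +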
combadd2(t0,t2,t1,16(x),24(x)) + movq t1, 40(z) + +// Result term 6 + + xorq t1, t1 + combads(t0,t2,24(x)) + movq t2, 48(z) + +// Result term 7 + + movq t0, 56(z) + +// Return + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_6_12.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_6_12.S new file mode 100644 index 00000000000..f0abc6480d3 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_6_12.S @@ -0,0 +1,214 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[6]; output z[12] +// +// extern void bignum_sqr_6_12 (uint64_t z[static 12], uint64_t x[static 6]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// A zero register + +#define zero %rbp +#define zeroe %ebp + +// Other registers + +#define d1 %r8 +#define d2 %r9 +#define d3 %r10 +#define d4 %r11 +#define d5 %r12 +#define d6 %r13 +#define d7 %r14 +#define d8 %r15 +#define d9 %rbx + +// Care is needed: re-using the zero register + +#define d10 %rbp + + +S2N_BN_SYMBOL(bignum_sqr_6_12): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Set up an initial window [d8;...d1] = [34;05;03;01] + + movq (x), %rdx + mulxq 8(x), d1, d2 + mulxq 24(x), d3, d4 + mulxq 40(x), d5, d6 + movq 24(x), %rdx + mulxq 32(x), d7, d8 + +// Clear our zero register, and also initialize the flags for the carry chain + + xorl zeroe, zeroe + +// Chain in the addition of 02 + 12 + 13 + 14 + 15 to that window +// (no carry-out possible since we add it to the top of a product) + + movq 16(x), %rdx + mulxq (x), %rax, %rcx + adcxq %rax, d2 + adoxq %rcx, d3 + mulxq 8(x), %rax, %rcx + adcxq %rax, d3 + adoxq %rcx, d4 + movq 8(x), %rdx + mulxq 24(x), %rax, %rcx + adcxq %rax, d4 + adoxq %rcx, d5 + mulxq 32(x), %rax, %rcx + adcxq %rax, d5 + adoxq %rcx, d6 + mulxq 40(x), %rax, %rcx + adcxq %rax, d6 + adoxq %rcx, d7 + adcxq zero, d7 + adoxq zero, d8 + adcxq zero, d8 + +// Again zero out the flags. Actually they are already cleared but it may +// help decouple these in the OOO engine not to wait for the chain above + + xorl zeroe, zeroe + +// Now chain in the 04 + 23 + 24 + 25 + 35 + 45 terms +// We are running out of registers and here our zero register is not zero! 
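+// (d10 aliases %rbp, so once the high word of the 45 product lands there,
+// the $0 loaded into %eax below stands in for the zero register.)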
+ + movq 32(x), %rdx + mulxq (x), %rax, %rcx + adcxq %rax, d4 + adoxq %rcx, d5 + movq 16(x), %rdx + mulxq 24(x), %rax, %rcx + adcxq %rax, d5 + adoxq %rcx, d6 + mulxq 32(x), %rax, %rcx + adcxq %rax, d6 + adoxq %rcx, d7 + mulxq 40(x), %rax, %rcx + adcxq %rax, d7 + adoxq %rcx, d8 + movq 24(x), %rdx + mulxq 40(x), %rax, d9 + adcxq %rax, d8 + adoxq zero, d9 + movq 32(x), %rdx + mulxq 40(x), %rax, d10 + adcxq %rax, d9 + movl $0, %eax + adoxq %rax, d10 + adcxq %rax, d10 + +// Again, just for a clear fresh start for the flags + + xorl %eax, %eax + +// Double and add to the 00 + 11 + 22 + 33 + 44 + 55 terms +// +// We could use shift-double but this seems tidier and in larger squarings +// it was actually more efficient. I haven't experimented with this small +// case to see how much that matters. Note: the writeback here is sprinkled +// into the sequence in such a way that things still work if z = x, i.e. if +// the output overwrites the input buffer and beyond. + + movq (x), %rdx + mulxq %rdx, %rax, %rdx + movq %rax, (z) + adcxq d1, d1 + adoxq %rdx, d1 + movq 8(x), %rdx + movq d1, 8(z) + mulxq %rdx, %rax, %rdx + adcxq d2, d2 + adoxq %rax, d2 + adcxq d3, d3 + adoxq %rdx, d3 + movq 16(x), %rdx + movq d2, 16(z) + mulxq %rdx, %rax, %rdx + adcxq d4, d4 + adoxq %rax, d4 + adcxq d5, d5 + adoxq %rdx, d5 + movq 24(x), %rdx + movq d3, 24(z) + mulxq %rdx, %rax, %rdx + adcxq d6, d6 + adoxq %rax, d6 + adcxq d7, d7 + adoxq %rdx, d7 + movq 32(x), %rdx + movq d4, 32(z) + mulxq %rdx, %rax, %rdx + adcxq d8, d8 + adoxq %rax, d8 + adcxq d9, d9 + adoxq %rdx, d9 + movq 40(x), %rdx + movq d5, 40(z) + mulxq %rdx, %rax, %rdx + movq d6, 48(z) + adcxq d10, d10 + movq d7, 56(z) + adoxq %rax, d10 + movq d8, 64(z) + movl $0, %eax + movq d9, 72(z) + adcxq %rax, %rdx + movq d10, 80(z) + adoxq %rax, %rdx + movq %rdx, 88(z) + +// Restore saved registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_6_12_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_6_12_alt.S new file mode 100644 index 00000000000..f576b42e165 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_6_12_alt.S @@ -0,0 +1,196 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[6]; output z[12] +// +// extern void bignum_sqr_6_12_alt (uint64_t z[static 12], uint64_t x[static 6]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12_alt) + .text + +// Input arguments + +#define z %rdi +#define x %rsi + +// Other variables used as a rotating 3-word window to add terms to + +#define t0 %r8 +#define t1 %r9 +#define t2 %r10 + +// Additional temporaries for local windows to share doublings + +#define u0 %rcx +#define u1 %r11 + +// Macro for the key "multiply and add to (c,h,l)" step + +#define combadd(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// Set up initial window (c,h,l) = numa * numb + +#define combaddz(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + xorq c, c ; \ + movq %rax, l ; \ + movq %rdx, h + +// Doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l) + +#define doubladd(c,h,l,hh,ll) \ + addq ll, ll ; \ + adcq hh, hh ; \ + adcq c, c ; \ + addq ll, l ; \ + adcq hh, h ; \ + adcq $0, c + +// Square term incorporation (c,h,l) += numba^2 + +#define combadd1(c,h,l,numa) \ + movq numa, %rax ; \ + mulq %rax; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa) \ + movq numa, %rax ; \ + mulq %rax; \ + addq %rax, l ; \ + adcq %rdx, h + +// A version doubling directly before adding, for single non-square terms + +#define combadd2(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0, c ; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +S2N_BN_SYMBOL(bignum_sqr_6_12_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Result term 0 + + movq (x), %rax + mulq %rax + + movq %rax, (z) + movq %rdx, t0 + xorq t1, t1 + +// Result term 1 + + xorq t2, t2 + combadd2(t2,t1,t0,(x),8(x)) + movq t0, 8(z) + +// Result term 2 + + xorq t0, t0 + combadd1(t0,t2,t1,8(x)) + combadd2(t0,t2,t1,(x),16(x)) + movq t1, 16(z) + +// Result term 3 + + combaddz(t1,u1,u0,(x),24(x)) + combadd(t1,u1,u0,8(x),16(x)) + doubladd(t1,t0,t2,u1,u0) + movq t2, 24(z) + +// Result term 4 + + combaddz(t2,u1,u0,(x),32(x)) + combadd(t2,u1,u0,8(x),24(x)) + doubladd(t2,t1,t0,u1,u0) + combadd1(t2,t1,t0,16(x)) + movq t0, 32(z) + +// Result term 5 + + combaddz(t0,u1,u0,(x),40(x)) + combadd(t0,u1,u0,8(x),32(x)) + combadd(t0,u1,u0,16(x),24(x)) + doubladd(t0,t2,t1,u1,u0) + movq t1, 40(z) + +// Result term 6 + + combaddz(t1,u1,u0,8(x),40(x)) + combadd(t1,u1,u0,16(x),32(x)) + doubladd(t1,t0,t2,u1,u0) + combadd1(t1,t0,t2,24(x)) + movq t2, 48(z) + +// Result term 7 + + combaddz(t2,u1,u0,16(x),40(x)) + combadd(t2,u1,u0,24(x),32(x)) + doubladd(t2,t1,t0,u1,u0) + movq t0, 56(z) + +// Result term 8 + + xorq t0, t0 + combadd2(t0,t2,t1,24(x),40(x)) + combadd1(t0,t2,t1,32(x)) + movq t1, 64(z) + +// Result term 9 + + xorq t1, t1 + combadd2(t1,t0,t2,32(x),40(x)) + movq t2, 72(z) + +// Result term 10 + + combads(t1,t0,40(x)) + movq t0, 80(z) + +// Result term 11 + + movq t1, 88(z) + +// Return + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + 
ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_8_16.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_8_16.S new file mode 100644 index 00000000000..b90101c0887 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_8_16.S @@ -0,0 +1,298 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[8]; output z[16] +// +// extern void bignum_sqr_8_16 (uint64_t z[static 16], uint64_t x[static 8]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// A zero register + +#define zero %rbp +#define zeroe %ebp + +// mulpadd i, j adds rdx * x[i] into the window at the i+j point + +.macro mulpadd arg1,arg2 + mulxq 8*\arg1(x), %rax, %rcx +.if ((\arg1 + \arg2) % 8 == 0) + adcxq %rax, %r8 + adoxq %rcx, %r9 +.elseif ((\arg1 + \arg2) % 8 == 1) + adcxq %rax, %r9 + adoxq %rcx, %r10 +.elseif ((\arg1 + \arg2) % 8 == 2) + adcxq %rax, %r10 + adoxq %rcx, %r11 +.elseif ((\arg1 + \arg2) % 8 == 3) + adcxq %rax, %r11 + adoxq %rcx, %r12 +.elseif ((\arg1 + \arg2) % 8 == 4) + adcxq %rax, %r12 + adoxq %rcx, %r13 +.elseif ((\arg1 + \arg2) % 8 == 5) + adcxq %rax, %r13 + adoxq %rcx, %r14 +.elseif ((\arg1 + \arg2) % 8 == 6) + adcxq %rax, %r14 + adoxq %rcx, %r15 +.elseif ((\arg1 + \arg2) % 8 == 7) + adcxq %rax, %r15 + adoxq %rcx, %r8 +.endif + +.endm + +// mulpade i, j adds rdx * x[i] into the window at i+j +// but re-creates the top word assuming nothing to add there + +.macro mulpade arg1,arg2 +.if ((\arg1 + \arg2) % 8 == 0) + mulxq 8*\arg1(x), %rax, %r9 + adcxq %rax, %r8 + adoxq zero, %r9 +.elseif ((\arg1 + \arg2) % 8 == 1) + mulxq 8*\arg1(x), %rax, %r10 + adcxq %rax, %r9 + adoxq zero, %r10 +.elseif ((\arg1 + \arg2) % 8 == 2) + mulxq 8*\arg1(x), %rax, %r11 + adcxq %rax, %r10 + adoxq zero, %r11 +.elseif ((\arg1 + \arg2) % 8 == 3) + mulxq 8*\arg1(x), %rax, %r12 + adcxq %rax, %r11 + adoxq zero, %r12 +.elseif ((\arg1 + \arg2) % 8 == 4) + mulxq 8*\arg1(x), %rax, %r13 + adcxq %rax, %r12 + adoxq zero, %r13 +.elseif ((\arg1 + \arg2) % 8 == 5) + mulxq 8*\arg1(x), %rax, %r14 + adcxq %rax, %r13 + adoxq zero, %r14 +.elseif ((\arg1 + \arg2) % 8 == 6) + mulxq 8*\arg1(x), %rax, %r15 + adcxq %rax, %r14 + adoxq zero, %r15 +.elseif ((\arg1 + \arg2) % 8 == 7) + mulxq 8*\arg1(x), %rax, %r8 + adcxq %rax, %r15 + adoxq zero, %r8 +.endif + +.endm + +.macro diagonals + + xorl zeroe, zeroe + +// Set initial window [%r8..%r10] + 2 wb = 10 + 20 + 30 + 40 + 50 + 60 + 70 + + movq (x), %rdx + mulxq 8(x), %r9, %rax + movq %r9, 8(z) + mulxq 16(x), %r10, %rcx + adcxq %rax, %r10 + movq %r10, 16(z) + mulxq 24(x), %r11, %rax + adcxq %rcx, %r11 + mulxq 32(x), %r12, %rcx + adcxq %rax, %r12 + mulxq 40(x), %r13, %rax + adcxq %rcx, %r13 + mulxq 48(x), %r14, %rcx + adcxq %rax, %r14 + mulxq 56(x), %r15, %r8 + adcxq %rcx, %r15 + adcxq zero, %r8 + +// Add in the next diagonal = 21 + 31 + 41 + 51 + 61 + 71 + 54 + + xorl zeroe, zeroe + movq 8(x), %rdx + mulpadd 2, 1 + 
movq %r11, 24(z) + mulpadd 3, 1 + movq %r12, 32(z) + mulpadd 4, 1 + mulpadd 5, 1 + mulpadd 6, 1 + mulpade 7, 1 + movq 32(x), %rdx + mulpade 5, 4 + adcxq zero, %r10 + +// And the next one = 32 + 42 + 52 + 62 + 72 + 64 + 65 + + xorl zeroe, zeroe + movq 16(x), %rdx + mulpadd 3, 2 + movq %r13, 40(z) + mulpadd 4, 2 + movq %r14, 48(z) + mulpadd 5, 2 + mulpadd 6, 2 + mulpadd 7, 2 + movq 48(x), %rdx + mulpade 4, 6 + mulpade 5, 6 + adcxq zero, %r12 + +// And the final one = 43 + 53 + 63 + 73 + 74 + 75 + 76 + + xorl zeroe, zeroe + movq 24(x), %rdx + mulpadd 4, 3 + movq %r15, 56(z) + mulpadd 5, 3 + movq %r8, 64(z) + mulpadd 6, 3 + mulpadd 7, 3 + movq 56(x), %rdx + mulpadd 4, 7 + mulpade 5, 7 + mulpade 6, 7 + adcxq zero, %r14 + +// Double and add things; use z[1]..z[8] and thereafter the registers +// %r9..%r15 which haven't been written back yet + + xorl zeroe, zeroe + movq (x), %rdx + mulxq %rdx, %rax, %rcx + movq %rax, (z) + movq 8(z), %rax + adcxq %rax, %rax + adoxq %rcx, %rax + movq %rax, 8(z) + + movq 16(z), %rax + movq 8(x), %rdx + mulxq %rdx, %rdx, %rcx + adcxq %rax, %rax + adoxq %rdx, %rax + movq %rax, 16(z) + movq 24(z), %rax + adcxq %rax, %rax + adoxq %rcx, %rax + movq %rax, 24(z) + + movq 32(z), %rax + movq 16(x), %rdx + mulxq %rdx, %rdx, %rcx + adcxq %rax, %rax + adoxq %rdx, %rax + movq %rax, 32(z) + movq 40(z), %rax + adcxq %rax, %rax + adoxq %rcx, %rax + movq %rax, 40(z) + + movq 48(z), %rax + movq 24(x), %rdx + mulxq %rdx, %rdx, %rcx + adcxq %rax, %rax + adoxq %rdx, %rax + movq %rax, 48(z) + movq 56(z), %rax + adcxq %rax, %rax + adoxq %rcx, %rax + movq %rax, 56(z) + + movq 64(z), %rax + movq 32(x), %rdx + mulxq %rdx, %rdx, %rcx + adcxq %rax, %rax + adoxq %rdx, %rax + movq %rax, 64(z) + adcxq %r9, %r9 + adoxq %rcx, %r9 + movq %r9, 72(z) + + movq 40(x), %rdx + mulxq %rdx, %rdx, %rcx + adcxq %r10, %r10 + adoxq %rdx, %r10 + movq %r10, 80(z) + adcxq %r11, %r11 + adoxq %rcx, %r11 + movq %r11, 88(z) + + movq 48(x), %rdx + mulxq %rdx, %rdx, %rcx + adcxq %r12, %r12 + adoxq %rdx, %r12 + movq %r12, 96(z) + adcxq %r13, %r13 + adoxq %rcx, %r13 + movq %r13, 104(z) + + movq 56(x), %rdx + mulxq %rdx, %rdx, %r15 + adcxq %r14, %r14 + adoxq %rdx, %r14 + movq %r14, 112(z) + adcxq zero, %r15 + adoxq zero, %r15 + movq %r15, 120(z) + +.endm + + +S2N_BN_SYMBOL(bignum_sqr_8_16): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Do the multiplication + + diagonals + +// Real epilog + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_8_16_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_8_16_alt.S new file mode 100644 index 00000000000..2991033f49d --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_8_16_alt.S @@ -0,0 +1,231 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[8]; output z[16] +// +// extern void bignum_sqr_8_16_alt (uint64_t z[static 16], uint64_t x[static 8]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16_alt) + .text + +// Input arguments + +#define z %rdi +#define x %rsi + +// Other variables used as a rotating 3-word window to add terms to + +#define t0 %r8 +#define t1 %r9 +#define t2 %r10 + +// Additional temporaries for local windows to share doublings + +#define u0 %rcx +#define u1 %r11 + +// Macro for the key "multiply and add to (c,h,l)" step + +#define combadd(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// Set up initial window (c,h,l) = numa * numb + +#define combaddz(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + xorq c, c ; \ + movq %rax, l ; \ + movq %rdx, h + +// Doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l) + +#define doubladd(c,h,l,hh,ll) \ + addq ll, ll ; \ + adcq hh, hh ; \ + adcq c, c ; \ + addq ll, l ; \ + adcq hh, h ; \ + adcq $0, c + +// Square term incorporation (c,h,l) += numba^2 + +#define combadd1(c,h,l,numa) \ + movq numa, %rax ; \ + mulq %rax; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa) \ + movq numa, %rax ; \ + mulq %rax; \ + addq %rax, l ; \ + adcq %rdx, h + +// A version doubling directly before adding, for single non-square terms + +#define combadd2(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0, c ; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +S2N_BN_SYMBOL(bignum_sqr_8_16_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Result term 0 + + movq (x), %rax + mulq %rax + + movq %rax, (z) + movq %rdx, t0 + xorq t1, t1 + +// Result term 1 + + xorq t2, t2 + combadd2(t2,t1,t0,(x),8(x)) + movq t0, 8(z) + +// Result term 2 + + xorq t0, t0 + combadd1(t0,t2,t1,8(x)) + combadd2(t0,t2,t1,(x),16(x)) + movq t1, 16(z) + +// Result term 3 + + combaddz(t1,u1,u0,(x),24(x)) + combadd(t1,u1,u0,8(x),16(x)) + doubladd(t1,t0,t2,u1,u0) + movq t2, 24(z) + +// Result term 4 + + combaddz(t2,u1,u0,(x),32(x)) + combadd(t2,u1,u0,8(x),24(x)) + doubladd(t2,t1,t0,u1,u0) + combadd1(t2,t1,t0,16(x)) + movq t0, 32(z) + +// Result term 5 + + combaddz(t0,u1,u0,(x),40(x)) + combadd(t0,u1,u0,8(x),32(x)) + combadd(t0,u1,u0,16(x),24(x)) + doubladd(t0,t2,t1,u1,u0) + movq t1, 40(z) + +// Result term 6 + + combaddz(t1,u1,u0,(x),48(x)) + combadd(t1,u1,u0,8(x),40(x)) + combadd(t1,u1,u0,16(x),32(x)) + doubladd(t1,t0,t2,u1,u0) + combadd1(t1,t0,t2,24(x)) + movq t2, 48(z) + +// Result term 7 + + combaddz(t2,u1,u0,(x),56(x)) + combadd(t2,u1,u0,8(x),48(x)) + combadd(t2,u1,u0,16(x),40(x)) + combadd(t2,u1,u0,24(x),32(x)) + doubladd(t2,t1,t0,u1,u0) + movq t0, 56(z) + +// Result term 8 + + combaddz(t0,u1,u0,8(x),56(x)) + combadd(t0,u1,u0,16(x),48(x)) + combadd(t0,u1,u0,24(x),40(x)) + doubladd(t0,t2,t1,u1,u0) + combadd1(t0,t2,t1,32(x)) + movq t1, 64(z) + +// Result term 9 + + combaddz(t1,u1,u0,16(x),56(x)) + combadd(t1,u1,u0,24(x),48(x)) + 
combadd(t1,u1,u0,32(x),40(x)) + doubladd(t1,t0,t2,u1,u0) + movq t2, 72(z) + +// Result term 10 + + combaddz(t2,u1,u0,24(x),56(x)) + combadd(t2,u1,u0,32(x),48(x)) + doubladd(t2,t1,t0,u1,u0) + combadd1(t2,t1,t0,40(x)) + movq t0, 80(z) + +// Result term 11 + + combaddz(t0,u1,u0,32(x),56(x)) + combadd(t0,u1,u0,40(x),48(x)) + doubladd(t0,t2,t1,u1,u0) + movq t1, 88(z) + +// Result term 12 + + xorq t1, t1 + combadd2(t1,t0,t2,40(x),56(x)) + combadd1(t1,t0,t2,48(x)) + movq t2, 96(z) + +// Result term 13 + + xorq t2, t2 + combadd2(t2,t1,t0,48(x),56(x)) + movq t0, 104(z) + +// Result term 14 + + combads(t2,t1,56(x)) + movq t1, 112(z) + +// Result term 15 + + movq t2, 120(z) + +// Return + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_add.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_add.S new file mode 100644 index 00000000000..58851be2b25 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_add.S @@ -0,0 +1,154 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Add, z := x + y +// Inputs x[m], y[n]; outputs function return (carry-out) and z[p] +// +// extern uint64_t bignum_add +// (uint64_t p, uint64_t *z, +// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Does the z := x + y operation, truncating modulo p words in general and +// returning a top carry (0 or 1) in the p'th place, only adding the input +// words below p (as well as m and n respectively) to get the sum and carry. +// +// Standard x86-64 ABI: RDI = p, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y, returns RAX +// Microsoft x64 ABI: RCX = p, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add) + .text + +#define p %rdi +#define z %rsi +#define m %rdx +#define x %rcx +#define n %r8 +#define y %r9 +#define i %r10 +#define a %rax + +#define ashort %eax + + + +S2N_BN_SYMBOL(bignum_add): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 + movq 64(%rsp), %r9 +#endif + +// Zero the main index counter for both branches + + xorq i, i + +// First clamp the two input sizes m := min(p,m) and n := min(p,n) since +// we'll never need words past the p'th. Can now assume m <= p and n <= p. 
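Before the two asymmetric branches that follow, a minimal C model of the z := x + y semantics specified in the bignum_add header above may help. This is illustrative only: bignum_add_ref is a hypothetical name, and unlike the assembly in this file it is neither constant-time nor length-clamping in place.

#include <stdint.h>

// Reference model: add the low min(m,p) words of x and min(n,p) words of y into
// p words of z, returning the carry out of the p'th place.
static uint64_t bignum_add_ref(uint64_t p, uint64_t *z, uint64_t m, const uint64_t *x,
                               uint64_t n, const uint64_t *y) {
  uint64_t c = 0;
  for (uint64_t i = 0; i < p; i++) {
    uint64_t a = (i < m) ? x[i] : 0;
    uint64_t b = (i < n) ? y[i] : 0;
    uint64_t s = a + c;
    uint64_t c1 = (s < a);        // carry from a + c (c is 0 or 1)
    z[i] = s + b;
    c = c1 | (z[i] < s);          // total carry out of this word
  }
  return c;
}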
+// Then compare the modified m and n and branch accordingly + + cmpq m, p + cmovcq p, m + cmpq n, p + cmovcq p, n + cmpq n, m + jc bignum_add_ylonger + +// The case where x is longer or of the same size (p >= m >= n) + + subq m, p + subq n, m + incq m + testq n, n + jz bignum_add_xtest +bignum_add_xmainloop: + movq (x,i,8), a + adcq (y,i,8), a + movq a, (z,i,8) + incq i + decq n + jnz bignum_add_xmainloop + jmp bignum_add_xtest +bignum_add_xtoploop: + movq (x,i,8), a + adcq $0, a + movq a, (z,i,8) + incq i +bignum_add_xtest: + decq m + jnz bignum_add_xtoploop + movl $0, ashort + adcq $0, a + testq p, p + jnz bignum_add_tails +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +// The case where y is longer (p >= n > m) + +bignum_add_ylonger: + + subq n, p + subq m, n + testq m, m + jz bignum_add_ytoploop +bignum_add_ymainloop: + movq (x,i,8), a + adcq (y,i,8), a + movq a, (z,i,8) + incq i + decq m + jnz bignum_add_ymainloop +bignum_add_ytoploop: + movq (y,i,8), a + adcq $0, a + movq a, (z,i,8) + incq i + decq n + jnz bignum_add_ytoploop + movl $0, ashort + adcq $0, a + testq p, p + jnz bignum_add_tails +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +// Adding a non-trivial tail, when p > max(m,n) + +bignum_add_tails: + movq a, (z,i,8) + xorq a, a + jmp bignum_add_tail +bignum_add_tailloop: + movq a, (z,i,8) +bignum_add_tail: + incq i + decq p + jnz bignum_add_tailloop +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_amontifier.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_amontifier.S new file mode 100644 index 00000000000..08c910bb633 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_amontifier.S @@ -0,0 +1,465 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Compute "amontification" constant z :== 2^{128k} (congruent mod m) +// Input m[k]; output z[k]; temporary buffer t[>=k] +// +// extern void bignum_amontifier +// (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t); +// +// This is called "amontifier" because any other value x can now be mapped into +// the almost-Montgomery domain with an almost-Montgomery multiplication by z. 
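A hypothetical usage sketch of that remark, pairing bignum_amontifier with bignum_amontmul (also part of this import): one almost-Montgomery multiplication by the amontifier constant maps a value into the Montgomery domain, since (val * 2^{128K}) / 2^{64K} is congruent to val * 2^{64K} mod the modulus. K, to_amont, val, val_m, mod and tmp are illustrative names, not part of this import.

#include <stdint.h>

#define K 8   // any digit count

extern void bignum_amontifier(uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t);
extern void bignum_amontmul(uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m);

static void to_amont(uint64_t val_m[K], uint64_t val[K], uint64_t mod[K]) {
  uint64_t z[K], tmp[K];
  bignum_amontifier(K, z, mod, tmp);        // z congruent to 2^{128K} (mod mod)
  bignum_amontmul(K, val_m, val, z, mod);   // val_m congruent to val * 2^{64K} (mod mod),
                                            // possibly not fully reduced ("almost")
}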
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = m, RCX = t +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = m, R9 = t +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_amontifier) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_amontifier) + .text + +#define k %rdi +#define z %rsi + +// These two inputs get moved to different places since RCX and RDX are special + +#define m %r12 +#define t %r13 + +// Other variables +// Matters that c is RCX as CL=lo(c) is assumed in shifts + +#define i %rbx +#define j %rbp +#define a %rax +#define c %rcx +#define h %r11 +#define l %r10 +#define b %r9 +#define n %r8 + +// Some aliases for the values b and n + +#define q %r8 +#define r %r9 + +#define ashort %eax +#define ishort %ebx +#define jshort %ebp +#define qshort %r8d + + +S2N_BN_SYMBOL(bignum_amontifier): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Save some additional registers for use, copy args out of RCX and RDX + + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + + movq %rdx, m + movq %rcx, t + +// If k = 0 the whole operation is trivial + + testq k, k + jz bignum_amontifier_end + +// Copy the input m into the temporary buffer t. The temporary register +// c matters since we want it to hold the highest digit, ready for the +// normalization phase. + + xorq i, i +bignum_amontifier_copyinloop: + movq (m,i,8), c + movq c, (t,i,8) + incq i + cmpq k, i + jc bignum_amontifier_copyinloop + +// Do a rather stupid but constant-time digit normalization, conditionally +// shifting left (k-1) times based on whether the top word is zero. +// With careful binary striding this could be O(k*log(k)) instead of O(k^2) +// while still retaining the constant-time style. +// The "neg c" sets the zeroness predicate (~CF) for the entire inner loop + + movq k, i + decq i + jz bignum_amontifier_normalized +bignum_amontifier_normloop: + xorq j, j + movq k, h + negq c + movl $0, ashort +bignum_amontifier_shufloop: + movq a, c + movq (t,j,8), a + cmovcq a, c + movq c, (t,j,8) + incq j + decq h + jnz bignum_amontifier_shufloop + decq i + jnz bignum_amontifier_normloop + +// We now have the top digit nonzero, assuming the input was nonzero, +// and as per the invariant of the loop above, c holds that digit. So +// now just count c's leading zeros and shift t bitwise that many bits. +// Note that we don't care about the result of bsr for zero inputs so +// the simple xor-ing with 63 is safe. + +bignum_amontifier_normalized: + + bsrq c, c + xorq $63, c + + xorq b, b + xorq i, i +bignum_amontifier_bitloop: + movq (t,i,8), a + movq a, j + shldq %cl, b, a + movq a, (t,i,8) + movq j, b + incq i + cmpq k, i + jc bignum_amontifier_bitloop + +// Let h be the high word of n, which in all the in-scope cases is >= 2^63. +// Now successively form q = 2^i div h and r = 2^i mod h as i goes from +// 64 to 126. We avoid just using division out of constant-time concerns +// (at the least we would need to fix up h = 0 for out-of-scope inputs) and +// don't bother with Newton-Raphson, since this stupid simple loop doesn't +// contribute much of the overall runtime at typical sizes. 
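A plain-C restatement of the estimation loop just described may make the invariant easier to follow. This branching sketch assumes h has its top bit set (the normalized case above); the assembly below performs the same updates branch-free with masks. In the corner case h = 2^63 it reproduces the same q = 2^63 - 1, r = 2^63 pair that the fix-up discussed further down corrects.

#include <stdint.h>

// Maintain 2^i == q*h + r (with r < h except in the h = 2^63 corner case)
// as i runs from 64 up to 126.
static void estimate_qr(uint64_t h, uint64_t *qout, uint64_t *rout) {
  uint64_t q = 1, r = 0 - h;            // i = 64: 2^64 = 1*h + (2^64 - h), since h >= 2^63
  for (int i = 64; i < 126; i++) {
    q += q;
    if (r >= h - r) {                   // doubling r would reach or exceed h
      q += 1;
      r = r - (h - r);                  // r := 2*r - h without overflowing 64 bits
    } else {
      r = r + r;
    }
  }
  *qout = q;                            // q = 2^126 div h (up to the fix-up noted below)
  *rout = r;
}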
+ + movq -8(t,k,8), h + movl $1, qshort + movq h, r + negq r + movl $62, ishort +bignum_amontifier_estloop: + + addq q, q + movq h, a + subq r, a + cmpq a, r // CF <=> r < h - r <=> 2 * r < h + sbbq a, a + notq a // a = bitmask(2 * r >= h) + subq a, q + addq r, r + andq h, a + subq a, r + decq i + jnz bignum_amontifier_estloop + +// Strictly speaking the above loop doesn't quite give the true remainder +// and quotient in the special case r = h = 2^63, so fix it up. We get +// q = 2^63 - 1 and r = 2^63 and really want q = 2^63 and r = 0. This is +// supererogatory, because the main property of q used below still holds +// in this case unless the initial m = 1, and then anyway the overall +// specification (congruence modulo m) holds degenerately. But it seems +// nicer to get a "true" quotient and remainder. + + incq r + cmpq r, h + adcq $0, q + +// So now we have q and r with 2^126 = q * h + r (imagining r = 0 in the +// fixed-up case above: note that we never actually use the computed +// value of r below and so didn't adjust it). And we can assume the ranges +// q <= 2^63 and r < h < 2^64. +// +// The idea is to use q as a first quotient estimate for a remainder +// of 2^{p+62} mod n, where p = 64 * k. We have, splitting n into the +// high and low parts h and l: +// +// 2^{p+62} - q * n = 2^{p+62} - q * (2^{p-64} * h + l) +// = 2^{p+62} - (2^{p-64} * (q * h) + q * l) +// = 2^{p+62} - 2^{p-64} * (2^126 - r) - q * l +// = 2^{p-64} * r - q * l +// +// Note that 2^{p-64} * r < 2^{p-64} * h <= n +// and also q * l < 2^63 * 2^{p-64} = 2^{p-1} <= n +// so |diff| = |2^{p-64} * r - q * l| < n. +// +// If in fact diff >= 0 then it is already 2^{p+62} mod n. +// otherwise diff + n is the right answer. +// +// To (maybe?) make the computation slightly easier we actually flip +// the sign and compute d = q * n - 2^{p+62}. Then the answer is either +// -d (when negative) or n - d; in either case we effectively negate d. +// This negating tweak in fact spoils the result for cases where +// 2^{p+62} mod n = 0, when we get n instead. However the only case +// where this can happen is m = 1, when the whole spec holds trivially, +// and actually the remainder of the logic below works anyway since +// the latter part of the code only needs a congruence for the k-digit +// result, not strict modular reduction (the doublings will maintain +// the non-strict inequality). + + xorq c, c + xorq i, i +bignum_amontifier_mulloop: + movq (t,i,8), %rax + mulq q + addq c, %rax + adcq $0, %rdx + movq %rax, (z,i,8) + movq %rdx, c + incq i + cmpq k, i + jc bignum_amontifier_mulloop + +// Now c is the high word of the product, so subtract 2^62 +// and then turn it into a bitmask in q = h + + movq $0x4000000000000000, %rax + subq a, c + sbbq q, q + notq q + +// Now do [c] * n - d for our final answer + + xorq c, c + xorq i, i +bignum_amontifier_remloop: + movq (t,i,8), a + andq q, a + negq c + sbbq (z,i,8), a + sbbq c, c + movq a, (z,i,8) + incq i + cmpq k, i + jc bignum_amontifier_remloop + +// Now still need to do a couple of modular doublings to get us all the +// way up to 2^{p+64} == r from initial 2^{p+62} == r (mod n). 
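For reference, a branching C model of a single one of those modular doublings, for a k-word z with 0 <= z < n. The assembly below folds the left shift and the subtraction of n into one pass and replaces the branch with a masked add-back; moddouble_ref is an illustrative name, not a function in this import.

#include <stdint.h>

static void moddouble_ref(uint64_t k, uint64_t *z, const uint64_t *n) {
  uint64_t topbit = 0, borrow = 0;
  for (uint64_t i = 0; i < k; i++) {
    uint64_t zi = z[i];
    uint64_t dbl = (zi << 1) | topbit;   // low 64 bits of 2*z at this position
    topbit = zi >> 63;                   // bit shifted into the next word
    uint64_t t = dbl - n[i];
    uint64_t b1 = (dbl < n[i]);
    z[i] = t - borrow;                   // tentative 2*z - n
    borrow = b1 | (t < borrow);
  }
  if (topbit < borrow) {                 // 2*z - n was negative, so add n back
    uint64_t carry = 0;
    for (uint64_t i = 0; i < k; i++) {
      uint64_t s = z[i] + n[i];
      uint64_t c1 = (s < z[i]);
      z[i] = s + carry;
      carry = c1 | (z[i] < s);           // final carry out is dropped, as intended
    }
  }
}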
+ + xorq c, c + xorq j, j + xorq b, b +bignum_amontifier_dubloop1: + movq (z,j,8), a + shrdq $63, a, c + negq b + sbbq (t,j,8), c + sbbq b, b + movq c, (z,j,8) + movq a, c + incq j + cmpq k, j + jc bignum_amontifier_dubloop1 + shrq $63, c + addq b, c + xorq j, j + xorq b, b +bignum_amontifier_corrloop1: + movq (t,j,8), a + andq c, a + negq b + adcq (z,j,8), a + sbbq b, b + movq a, (z,j,8) + incq j + cmpq k, j + jc bignum_amontifier_corrloop1 + +// This is not exactly the same: we also copy output to t giving the +// initialization t_1 = r == 2^{p+64} mod n for the main loop next. + + xorq c, c + xorq j, j + xorq b, b +bignum_amontifier_dubloop2: + movq (z,j,8), a + shrdq $63, a, c + negq b + sbbq (t,j,8), c + sbbq b, b + movq c, (z,j,8) + movq a, c + incq j + cmpq k, j + jc bignum_amontifier_dubloop2 + shrq $63, c + addq b, c + xorq j, j + xorq b, b +bignum_amontifier_corrloop2: + movq (t,j,8), a + andq c, a + negq b + adcq (z,j,8), a + sbbq b, b + movq a, (z,j,8) + movq a, (t,j,8) + incq j + cmpq k, j + jc bignum_amontifier_corrloop2 + +// We then successively generate (k+1)-digit values satisfying +// t_i == 2^{p+64*i} mod n, each of which is stored in h::t. Finish +// initialization by zeroing h initially + + xorq h, h + +// Then if t_i = 2^{p} * h + l +// we have t_{i+1} == 2^64 * t_i +// = (2^{p+64} * h) + (2^64 * l) +// == r * h + l<<64 +// Do this k more times so we end up == 2^{128*k+64}, one more than we want +// +// Writing B = 2^{64k}, the possible correction of adding r, which for +// a (k+1)-digit result is equivalent to subtracting q = 2^{64*(k+1)} - r +// would give the overall worst-case value minus q of +// [ B * (B^k - 1) + (B - 1) * r ] - [B^{k+1} - r] +// = B * (r - 1) < B^{k+1} so we keep inside k+1 digits as required. +// +// This implementation makes the shift implicit by starting b with the +// "previous" digit (initially 0) to offset things by 1. + + movq k, i +bignum_amontifier_modloop: + xorq b, b + movq k, n + xorq j, j + xorq c, c +bignum_amontifier_cmaloop: + adcq b, c + sbbq l, l + movq (z,j,8), %rax + mulq h + subq l, %rdx + addq c, %rax + movq (t,j,8), b + movq %rax, (t,j,8) + movq %rdx, c + incq j + decq n + jnz bignum_amontifier_cmaloop + adcq c, b + movq b, h + + sbbq l, l + + xorq j, j + xorq c, c +bignum_amontifier_oaloop: + movq (t,j,8), a + movq (z,j,8), b + andq l, b + negq c + adcq b, a + sbbq c, c + movq a, (t,j,8) + incq j + cmpq k, j + jc bignum_amontifier_oaloop + subq c, h + + decq i + jnz bignum_amontifier_modloop + +// Now do one almost-Montgomery reduction w.r.t. the original m +// which lops off one 2^64 from the congruence and, with the usual +// almost-Montgomery correction, gets us back inside k digits for +// the end result. 
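The word-level Montgomery reduction step referred to here (and reused by the amontmul, amontredc and amontsqr routines later in this import) can be summarized in C as follows. The interface (montredc_step, the separate hi word) is illustrative rather than anything in this import, and it assumes a compiler providing unsigned __int128; w is the negated inverse of m[0] described below.

#include <stdint.h>

// Given a (k+1)-word accumulator (a[0..k-1] plus top word hi), choose d so that
// a + d*m is divisible by 2^64, add d*m, and shift down one word in place.
// Returns the extra carry out of the k-word window.
static uint64_t montredc_step(uint64_t k, uint64_t *a, uint64_t hi,
                              const uint64_t *m, uint64_t w) {
  uint64_t d = a[0] * w;                       // w*m[0] == -1 (mod 2^64), so low word cancels
  unsigned __int128 t = (unsigned __int128)d * m[0] + a[0];
  uint64_t carry = (uint64_t)(t >> 64);        // low 64 bits of t are zero by construction
  for (uint64_t j = 1; j < k; j++) {
    t = (unsigned __int128)d * m[j] + a[j] + carry;
    a[j - 1] = (uint64_t)t;                    // write back shifted down one word
    carry = (uint64_t)(t >> 64);
  }
  a[k - 1] = hi + carry;                       // absorb the incoming top word
  return (a[k - 1] < carry);                   // extra carry beyond k words
}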
+ + movq (m), a + movq a, c + movq a, b + shlq $2, c + subq c, b + xorq $2, b + movq b, c + imulq a, c + movl $2, ashort + addq c, a + addq $1, c + imulq a, b + imulq c, c + movl $1, ashort + addq c, a + imulq a, b + imulq c, c + movl $1, ashort + addq c, a + imulq a, b + imulq c, c + movl $1, ashort + addq c, a + imulq a, b + + movq (t), c + imulq c, b + + movq (m), %rax + mulq b + addq c, %rax + movq %rdx, c + movl $1, jshort + movq k, n + decq n + jz bignum_amontifier_montend + +bignum_amontifier_montloop: + adcq (t,j,8), c + sbbq l, l + movq (m,j,8), %rax + mulq b + subq l, %rdx + addq c, %rax + movq %rax, -8(t,j,8) + movq %rdx, c + incq j + decq n + jnz bignum_amontifier_montloop +bignum_amontifier_montend: + adcq c, h + sbbq l, l + movq h, -8(t,k,8) + + xorq j, j + xorq c, c +bignum_amontifier_osloop: + movq (t,j,8), a + movq (m,j,8), b + andq l, b + negq c + sbbq b, a + sbbq c, c + movq a, (z,j,8) + incq j + cmpq k, j + jc bignum_amontifier_osloop + + bignum_amontifier_end: + popq %r13 + popq %r12 + popq %rbx + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_amontmul.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_amontmul.S new file mode 100644 index 00000000000..d40c8aecfc0 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_amontmul.S @@ -0,0 +1,249 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Almost-Montgomery multiply, z :== (x * y / 2^{64k}) (congruent mod m) +// Inputs x[k], y[k], m[k]; output z[k] +// +// extern void bignum_amontmul +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); +// +// Does z :== (x * y / 2^{64k}) mod m, meaning that the result, in the native +// size k, is congruent modulo m, but might not be fully reduced mod m. This +// is why it is called *almost* Montgomery multiplication. +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = y, R8 = m +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = y, [RSP+40] = m +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_amontmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_amontmul) + .text + +// We copy x into %r9 but it comes in in %rdx originally + +#define k %rdi +#define z %rsi +#define x %r9 +#define y %rcx +#define m %r8 + +// General temp, low part of product and mul input +#define a %rax +// General temp, High part of product +#define b %rdx +// Inner loop counter +#define j %rbx +// Home for i'th digit or Montgomery multiplier +#define d %rbp +#define h %r10 +#define e %r11 +#define n %r12 +#define i %r13 +#define c0 %r14 +#define c1 %r15 + +// This one variable we store on the stack as we are a register short. +// At least it's only used once per iteration of the outer loop (k times) +// and with a single read each time, after one initial write. The variable +// is the word-level negated modular inverse + +#define w (%rsp) + +// Some more intuitive names for temp regs in initial word-level negmodinv. 
+ +#define t1 %rbx +#define t2 %rdx + +#define ashort %eax +#define jshort %ebx + + +S2N_BN_SYMBOL(bignum_amontmul): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// Save registers and allocate space on stack for non-register variable w + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $8, %rsp + +// If k = 0 the whole operation is trivial + + testq k, k + jz bignum_amontmul_end + +// Move x input into its permanent home, since we need %rdx for multiplications + + movq %rdx, x + +// Compute word-level negated modular inverse w for m == m[0]. + + movq (m), a + + movq a, t2 + movq a, t1 + shlq $2, t2 + subq t2, t1 + xorq $2, t1 + + movq t1, t2 + imulq a, t2 + movl $2, ashort + addq t2, a + addq $1, t2 + + imulq a, t1 + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, t1 + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, t1 + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, t1 + + movq t1, w + +// Initialize the output c0::z to zero so we can then consistently add rows. +// It would be a bit more efficient to special-case the zeroth row, but +// this keeps the code slightly simpler. + + xorq i, i // Also initializes i for main loop + xorq j, j +bignum_amontmul_zoop: + movq i, (z,j,8) + incq j + cmpq k, j + jc bignum_amontmul_zoop + + xorq c0, c0 + +// Outer loop pulling down digits d=x[i], multiplying by y and reducing + +bignum_amontmul_outerloop: + +// Multiply-add loop where we always have CF + previous high part h to add in. +// Note that in general we do need yet one more carry in this phase and hence +// initialize c1 with the top carry. + + movq (x,i,8), d + xorq j, j + xorq h, h + xorq c1, c1 + movq k, n + +bignum_amontmul_maddloop: + adcq (z,j,8), h + sbbq e, e + movq (y,j,8), a + mulq d + subq e, %rdx + addq h, a + movq a, (z,j,8) + movq %rdx, h + incq j + decq n + jnz bignum_amontmul_maddloop + adcq h, c0 + adcq c1, c1 + +// Montgomery reduction loop, similar but offsetting writebacks + + movq (z), e + movq w, d + imulq e, d + movq (m), a + mulq d + addq e, a // Will be zero but want the carry + movq %rdx, h + movl $1, jshort + movq k, n + decq n + jz bignum_amontmul_montend + +bignum_amontmul_montloop: + adcq (z,j,8), h + sbbq e, e + movq (m,j,8), a + mulq d + subq e, %rdx + addq h, a + movq a, -8(z,j,8) + movq %rdx, h + incq j + decq n + jnz bignum_amontmul_montloop + +bignum_amontmul_montend: + adcq c0, h + adcq $0, c1 + movq c1, c0 + movq h, -8(z,j,8) + +// End of outer loop. + + incq i + cmpq k, i + jc bignum_amontmul_outerloop + +// Now convert carry word, which is always in {0,1}, into a mask "d" +// and do a masked subtraction of m for the final almost-Montgomery result. 
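A short C sketch of that masked correction, assuming (as stated above) that the carry word is 0 or 1; masked_sub is an illustrative name.

#include <stdint.h>

// Subtract m from z exactly when c == 1, without branching on c.
static void masked_sub(uint64_t k, uint64_t *z, const uint64_t *m, uint64_t c) {
  uint64_t mask = 0 - c;                 // all ones when c == 1, all zeros when c == 0
  uint64_t borrow = 0;
  for (uint64_t j = 0; j < k; j++) {
    uint64_t mj = m[j] & mask;
    uint64_t t = z[j] - mj;
    uint64_t b1 = (z[j] < mj);
    z[j] = t - borrow;
    borrow = b1 | (t < borrow);
  }
}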
+ + xorq d, d + subq c0, d + xorq e, e + xorq j, j +bignum_amontmul_corrloop: + movq (m,j,8), a + andq d, a + negq e + sbbq a, (z,j,8) + sbbq e, e + incq j + cmpq k, j + jc bignum_amontmul_corrloop + +bignum_amontmul_end: + + addq $8, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_amontredc.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_amontredc.S new file mode 100644 index 00000000000..c28d9f3d1f1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_amontredc.S @@ -0,0 +1,246 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Almost-Montgomery reduce, z :== (x' / 2^{64p}) (congruent mod m) +// Inputs x[n], m[k], p; output z[k] +// +// extern void bignum_amontredc +// (uint64_t k, uint64_t *z, +// uint64_t n, uint64_t *x, uint64_t *m, uint64_t p); +// +// Does a :== (x' / 2^{64p}) mod m where x' = x if n <= p + k and in general +// is the lowest (p+k) digits of x. That is, p-fold almost-Montgomery reduction +// w.r.t. a k-digit modulus m giving a k-digit answer. +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x, R8 = m, R9 = p +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = n, R9 = x, [RSP+40] = m, [RSP+48] = p +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_amontredc) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_amontredc) + .text + +// We copy x into %r10 but it comes in in %rdx originally + +#define k %rdi +#define z %rsi +#define n %r10 +#define x %rcx +#define m %r8 +#define p %r9 + +// General temp, low part of product and mul input +#define a %rax +// General temp, High part of product +#define b %rdx +// Negated modular inverse +#define w (%rsp) +// Inner loop counter +#define j %rbx +// Home for i'th digit or Montgomery multiplier +#define d %rbp +#define h %r11 +#define e %r12 +#define t %r13 +#define i %r14 +#define c %r15 + +// Some more intuitive names for temp regs in initial word-level negmodinv. + +#define t1 %rbx +#define t2 %r14 + +#define ashort %eax +#define cshort %r15d +#define jshort %ebx + + +S2N_BN_SYMBOL(bignum_amontredc): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 + movq 64(%rsp), %r9 +#endif + +// Save registers and allocate space on stack for non-register variable w + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $8, %rsp + +// If k = 0 the whole operation is trivial + + testq k, k + jz bignum_amontredc_end + +// Move n input into its permanent home, since we need %rdx for multiplications + + movq %rdx, n + +// Compute word-level negated modular inverse w for m == m[0]. 
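The negated inverse computed inline below can be summarized by the standard Newton (Hensel-lifting) iteration; this reference sketch is not a transcription of the exact instruction sequence used here, but for odd m it returns the same value, namely w with w * m + 1 == 0 (mod 2^64).

#include <stdint.h>

static uint64_t word_negmodinv_ref(uint64_t m) {
  uint64_t w = (m * 3) ^ 2;     // correct inverse of odd m modulo 2^5
  w *= 2 - m * w;               // each step doubles the number of correct bits:
  w *= 2 - m * w;               // 2^10, then 2^20, 2^40, and finally >= 2^64
  w *= 2 - m * w;
  w *= 2 - m * w;
  return 0 - w;                 // negate, so that (return value) * m == -1 (mod 2^64)
}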
+ + movq (m), a + + movq a, t2 + movq a, t1 + shlq $2, t2 + subq t2, t1 + xorq $2, t1 + + movq t1, t2 + imulq a, t2 + movl $2, ashort + addq t2, a + addq $1, t2 + + imulq a, t1 + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, t1 + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, t1 + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, t1 + + movq t1, w + +// Initialize z to the lowest k digits of the input, zero-padding if n < k. + + movq k, j + cmpq k, n + cmovcq n, j + xorq i, i + testq j, j + jz bignum_amontredc_padloop +bignum_amontredc_copyloop: + movq (x,i,8), a + movq a, (z,i,8) + incq i + cmpq j, i + jc bignum_amontredc_copyloop + + cmpq k, i + jnc bignum_amontredc_initialized + + xorq j, j +bignum_amontredc_padloop: + movq j, (z,i,8) + incq i + cmpq k, i + jc bignum_amontredc_padloop + +bignum_amontredc_initialized: + xorq c, c + +// Now if p = 0 that's the end of the operation + + testq p, p + jz bignum_amontredc_end + +// Outer loop, just doing a standard Montgomery reduction on z + + xorq i, i +bignum_amontredc_outerloop: + movq (z), e + movq w, d + imulq e, d + movq (m), a + mulq d + addq e, a // Will be zero but want the carry + movq %rdx, h + movl $1, jshort + movq k, t + decq t + jz bignum_amontredc_montend + +bignum_amontredc_montloop: + adcq (z,j,8), h + sbbq e, e + movq (m,j,8), a + mulq d + subq e, %rdx + addq h, a + movq a, -8(z,j,8) + movq %rdx, h + incq j + decq t + jnz bignum_amontredc_montloop + +bignum_amontredc_montend: + adcq c, h + movl $0, cshort + adcq $0, c + + addq i, j + cmpq n, j + jnc bignum_amontredc_offtheend + movq (x,j,8), a + addq a, h + adcq $0, c +bignum_amontredc_offtheend: + movq h, -8(z,k,8) + +// End of outer loop. + + incq i + cmpq p, i + jc bignum_amontredc_outerloop + +// Now convert carry word, which is always in {0,1}, into a mask "d" +// and do a masked subtraction of m for the final almost-Montgomery result. + + xorq d, d + subq c, d + xorq e, e + xorq j, j +bignum_amontredc_corrloop: + movq (m,j,8), a + andq d, a + negq e + sbbq a, (z,j,8) + sbbq e, e + incq j + cmpq k, j + jc bignum_amontredc_corrloop + +bignum_amontredc_end: + addq $8, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_amontsqr.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_amontsqr.S new file mode 100644 index 00000000000..c6549f7f731 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_amontsqr.S @@ -0,0 +1,236 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Almost-Montgomery square, z :== (x^2 / 2^{64k}) (congruent mod m) +// Inputs x[k], m[k]; output z[k] +// +// extern void bignum_amontsqr +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); +// +// Does z :== (x^2 / 2^{64k}) mod m, meaning that the result, in the native +// size k, is congruent modulo m, but might not be fully reduced mod m. This +// is why it is called *almost* Montgomery squaring. 
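A hypothetical illustration of that remark: squaring yields the same residue class as multiplying x by itself, so a caller could (less efficiently) substitute bignum_amontmul. Only congruence mod m is promised either way, since both results are merely "almost" reduced; amontsqr_via_mul is an illustrative name.

#include <stdint.h>

extern void bignum_amontmul(uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m);

static void amontsqr_via_mul(uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m) {
  bignum_amontmul(k, z, x, x, m);   // congruent mod m to bignum_amontsqr(k, z, x, m)
}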
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = m +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = m +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_amontsqr) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_amontsqr) + .text + +// We copy x into %r9 but it comes in in %rdx originally + +#define k %rdi +#define z %rsi +#define x %r9 +#define m %rcx + +// General temp, low part of product and mul input +#define a %rax +// General temp, High part of product +#define b %rdx +// Negated modular inverse +#define w %r8 +// Inner loop counter +#define j %rbx +// Home for i'th digit or Montgomery multiplier +#define d %rbp +#define h %r10 +#define e %r11 +#define n %r12 +#define i %r13 +#define c0 %r14 +#define c1 %r15 + +// A temp reg in the initial word-level negmodinv. + +#define t2 %rdx + +#define ashort %eax +#define jshort %ebx + + +S2N_BN_SYMBOL(bignum_amontsqr): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Save registers + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// If k = 0 the whole operation is trivial + + testq k, k + jz bignum_amontsqr_end + +// Move x input into its permanent home, since we need %rdx for multiplications + + movq %rdx, x + +// Compute word-level negated modular inverse w for m == m[0]. + + movq (m), a + + movq a, t2 + movq a, w + shlq $2, t2 + subq t2, w + xorq $2, w + + movq w, t2 + imulq a, t2 + movl $2, ashort + addq t2, a + addq $1, t2 + + imulq a, w + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, w + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, w + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, w + +// Initialize the output c0::z to zero so we can then consistently add rows. +// It would be a bit more efficient to special-case the zeroth row, but +// this keeps the code slightly simpler. + + xorq i, i // Also initializes i for main loop + xorq j, j +bignum_amontsqr_zoop: + movq i, (z,j,8) + incq j + cmpq k, j + jc bignum_amontsqr_zoop + + xorq c0, c0 + +// Outer loop pulling down digits d=x[i], multiplying by x and reducing + +bignum_amontsqr_outerloop: + +// Multiply-add loop where we always have CF + previous high part h to add in. +// Note that in general we do need yet one more carry in this phase and hence +// initialize c1 with the top carry. + + movq (x,i,8), d + xorq j, j + xorq h, h + xorq c1, c1 + movq k, n + +bignum_amontsqr_maddloop: + adcq (z,j,8), h + sbbq e, e + movq (x,j,8), a + mulq d + subq e, %rdx + addq h, a + movq a, (z,j,8) + movq %rdx, h + incq j + decq n + jnz bignum_amontsqr_maddloop + adcq h, c0 + adcq c1, c1 + +// Montgomery reduction loop, similar but offsetting writebacks + + movq (z), e + movq w, d + imulq e, d + movq (m), a + mulq d + addq e, a // Will be zero but want the carry + movq %rdx, h + movl $1, jshort + movq k, n + decq n + jz bignum_amontsqr_montend + +bignum_amontsqr_montloop: + adcq (z,j,8), h + sbbq e, e + movq (m,j,8), a + mulq d + subq e, %rdx + addq h, a + movq a, -8(z,j,8) + movq %rdx, h + incq j + decq n + jnz bignum_amontsqr_montloop + +bignum_amontsqr_montend: + adcq c0, h + adcq $0, c1 + movq c1, c0 + movq h, -8(z,j,8) + +// End of outer loop. 
+ + incq i + cmpq k, i + jc bignum_amontsqr_outerloop + +// Now convert carry word, which is always in {0,1}, into a mask "d" +// and do a masked subtraction of m for the final almost-Montgomery result. + + xorq d, d + subq c0, d + xorq e, e + xorq j, j +bignum_amontsqr_corrloop: + movq (m,j,8), a + andq d, a + negq e + sbbq a, (z,j,8) + sbbq e, e + incq j + cmpq k, j + jc bignum_amontsqr_corrloop + +bignum_amontsqr_end: + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_bitfield.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_bitfield.S new file mode 100644 index 00000000000..6b8e366b511 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_bitfield.S @@ -0,0 +1,124 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Select bitfield starting at bit n with length l <= 64 +// Inputs x[k], n, l; output function return +// +// extern uint64_t bignum_bitfield +// (uint64_t k, uint64_t *x, uint64_t n, uint64_t l); +// +// One-word bitfield from a k-digit (digit=64 bits) bignum, in constant-time +// style. Bitfield starts at bit n and has length l, indexing from 0 (=LSB). +// Digits above the top are treated uniformly as zero, as usual. Since the +// result is returned in a single word, effectively we use l' = min(64,l) +// for the length. +// +// Standard x86-64 ABI: RDI = k, RSI = x, RDX = n, RCX = l, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = x, R8 = n, R9 = l, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_bitfield) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_bitfield) + .text + +#define k %rdi +#define x %rsi +#define n %rdx +#define l %rcx + +#define d %r8 +#define e %rax +#define i %r9 +#define a %r10 +#define m %r11 + +#define mshort %r11d + + + +S2N_BN_SYMBOL(bignum_bitfield): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Initialize second of digit pair to zero and if length is zero finish +// immediately; the digit e is also the return value in RAX + + xorq e, e + testq k, k + jz bignum_bitfield_end + +// Decompose the index into n = 64 * n + m, then increment n for next part + + movl $63, mshort + andq n, m + shrq $6, n + incq n + +// Run over the digits setting d = n'th and e = (n+1)'th + + xorq i, i +bignum_bitfield_loop: + movq (x,i,8), a + cmpq n, i + cmovcq a, d + cmovzq a, e + incq i + cmpq k, i + jc bignum_bitfield_loop + +// Put zero in a register, for several purposes + + xorq a, a + +// Override d with 0 if we ran off the end (e will retain original 0). + + cmpq n, i + cmovcq a, d + +// Override e if we have m = 0 (i.e. original n was divisible by 64) +// This is because then we want to shift it right by 64 below. 
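For comparison, a branching (non-constant-time) C model of the extraction specified in the bignum_bitfield header above; it must dodge the same two shift-by-64 hazards the comments mention, namely the combine step when n is a multiple of 64 and the mask when l >= 64. bignum_bitfield_ref is an illustrative name.

#include <stdint.h>

static uint64_t bignum_bitfield_ref(uint64_t k, const uint64_t *x, uint64_t n, uint64_t l) {
  uint64_t w = n / 64, sh = n % 64;
  uint64_t d = (w < k) ? x[w] : 0;           // digit containing bit n, zero off the end
  uint64_t e = (w + 1 < k) ? x[w + 1] : 0;   // next digit up, zero off the end
  uint64_t v = sh ? ((d >> sh) | (e << (64 - sh))) : d;
  uint64_t mask = (l >= 64) ? ~(uint64_t)0 : (((uint64_t)1 << l) - 1);
  return v & mask;
}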
+ + testq m, m + cmovzq a, e + +// Create a size-l bitmask first (while the shift is conveniently in CL) + + cmpq $64, l + adcq a, a + shlq %cl, a + decq a + +// Combine shifted digits to get the bitfield(n,64) + + movq m, l + shrq %cl, d + negq %rcx + shlq %cl, e + orq d, e + +// Now mask it down to get bitfield (n,l) + + andq a, e + +bignum_bitfield_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_bitsize.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_bitsize.S new file mode 100644 index 00000000000..2d7331a8626 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_bitsize.S @@ -0,0 +1,88 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Return size of bignum in bits +// Input x[k]; output function return +// +// extern uint64_t bignum_bitsize (uint64_t k, uint64_t *x); +// +// In the case of a zero bignum as input the result is 0 +// +// In principle this has a precondition k < 2^58, but obviously that +// is always true in practice because of address space limitations. +// +// Standard x86-64 ABI: RDI = k, RSI = x, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = x, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_bitsize) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_bitsize) + .text + +#define k %rdi +#define x %rsi +#define i %rax +#define w %rdx +#define a %rcx +#define j %r8 + + + +S2N_BN_SYMBOL(bignum_bitsize): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Initialize the index i and also prepare default return value of 0 (i = %rax) + + xorq i, i + +// If the bignum is zero-length, just return 0 + + testq k, k + jz bignum_bitsize_end + +// Use w = a[i-1] to store nonzero words in a bottom-up sweep +// Set the initial default to be as if we had a 11...11 word directly below + + movq $-1, w + xorq j, j +bignum_bitsize_loop: + movq (x,j,8), a + incq j + testq a, a + cmovnzq j, i + cmovnzq a, w + cmpq k, j + jnz bignum_bitsize_loop + +// Now w = a[i-1] is the highest nonzero word, or in the zero case the +// default of the "extra" 11...11 = a[0-1]. We now want 64* i - clz(w) = +// 64 * i - (63 - bsr(w)) = (64 * i - 63) + bsr(w). Note that this code +// does not rely on the behavior of the bsr instruction for zero inputs, +// which is undefined. + + shlq $6, i + subq $63, i + bsrq w, w + addq w, %rax + +bignum_bitsize_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cdiv.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cdiv.S new file mode 100644 index 00000000000..49d17e97828 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cdiv.S @@ -0,0 +1,336 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Divide by a single (nonzero) word, z := x / m and return x mod m +// Inputs x[n], m; outputs function return (remainder) and z[k] +// +// extern uint64_t bignum_cdiv +// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t m); +// +// Does the "z := x / m" operation where x is n digits, result z is k. +// Truncates the quotient in general, but always (for nonzero m) returns +// the true remainder x mod m. +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x, R8 = m +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = n, R9 = x, [RSP+40] = m +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cdiv) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cdiv) + .text + +#define k %rdi +#define z %rsi +#define m %r8 + +// These parameters get moved because of special uses for %rcx, %rdx + +#define n %r9 +#define x %r10 + +// This needs to be in %rcx for variable shifts with %cl + +#define e %rcx + +// Other variables + +#define w %r11 +#define d %r12 +#define i %rbx +#define c %r13 +#define l %r14 + +#define a %rax +#define h %rdx + +#define ashort %eax +#define ishort %ebx +#define hshort %edx + +// The remainder + +#define r %r15 + +S2N_BN_SYMBOL(bignum_cdiv): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Move parameters that need a new home + + movq %rdx, n + movq %rcx, x + +// First do a modulus computation, slightly tweaked from bignum_cmod, +// changing variables and avoiding modification of the size parameter. +// Initialize l = 0 now for convenience (we eventually need to do it). 
+// If the bignum is zero-length, l is already the right answer of 0 + + xorq l, l + testq n, n + jz bignum_cdiv_nomodulus + + bsrq m, e + xorq $63, e + shlq %cl, m + + movq m, r + movq $0x1FFFFFFFFFFFF, w + shrq $16, r + xorq r, w + incq r + shrq $32, w + movq r, h + imulq w, h + negq h + movq h, a + shrq $49, a + imulq a, a + shrq $34, h + addq a, h + orq $0x40000000, a + imulq h, a + shrq $30, a + imulq w, a + shlq $30, w + addq a, w + shrq $30, w + movq r, h + imulq w, h + negq h + shrq $24, h + imulq w, h + shlq $16, w + shrq $24, h + addq h, w + movq r, h + imulq w, h + negq h + shrq $32, h + imulq w, h + shlq $31, w + shrq $17, h + addq h, w + movq m, a + mulq w + shrdq $60, h, a + movq w, h + shrq $33, h + notq a + imulq h, a + shlq $1, w + shrq $33, a + addq a, w + addq $1, w + movq m, a + sbbq $0, w + mulq w + addq m, h + sbbq $0, w + + movq m, r + imulq w, r + negq r + + xorl hshort, hshort + movq n, i +bignum_cdiv_modloop: + movq h, a + mulq r + addq -8(x,i,8), a + adcq l, h + movq a, l + sbbq a, a + andq r, a + addq a, l + adcq $0, h + decq i + jnz bignum_cdiv_modloop + + movq h, i + movq w, a + mulq h + addq i, h + sbbq r, r + andq m, r + + movq h, a + mulq m + addq r, h + xorq r, r + subq a, l + sbbq h, i + + cmovnzq m, r + xorl ashort, ashort + subq r, l + sbbq a, i + + cmovnzq m, a + subq a, l + + movq w, a + mulq l + addq l, h + rcr $1, h + + shrq %cl, m + xorq $63, e + shrq %cl, h + + imulq m, h + subq h, l + + movq l, r + subq m, l +bignum_cdiv_nomodulus: + cmovncq l, r + +// If k = 0 then there's no more to be done + + testq k, k + jz bignum_cdiv_end + +// Let e be the number of trailing zeros in m (we can ignore m = 0) + + bsfq m, e + +// Now just shift m right by e bits. So hereafter we can assume m is odd +// but we first need to shift the input right by e bits then divide by m. + + shrq %cl, m + +// Compute the negated modular inverse w with w * m + 1 == 0 (mod 2^64) +// This is essentially the same as word_negmodinv. + + movq m, a + movq m, w + shlq $2, a + subq a, w + xorq $2, w + movq w, a + imulq m, a + movl $2, hshort + addq a, h + addq $1, a + imulq h, w + imulq a, a + movl $1, hshort + addq a, h + imulq h, w + imulq a, a + movl $1, hshort + addq a, h + imulq h, w + imulq a, a + movl $1, hshort + addq a, h + imulq h, w + +// We have the remainder r, so now x = m * y + r for some quotient y +// to be computed. Consider x' = x + (m - r) = m * (y + 1) and do a +// Montgomery reduction, keeping the cofactor z. This gives us +// x' + m * z = 2^{64k} * c where c <= m. Thus since x' = m * (y + 1) +// we have +// +// m * (y + z + 1) = 2^{64k} * c +// +// This means m * (y + z + 1) == 0 (mod 2^{64k}), even when we truncate +// x to k digits (if in fact k < n). Since m is odd, it's coprime to +// 2^{64k} so we can cancel and get y + z + 1 == 0 (mod 2^{64k}), and +// hence using logical complement y == ~z (mod 2^{64k}). Thus we can +// write back the logical complements of the cofactor as the answer. +// Start with carry word c = m - r/2^e to make the initial tweak +// x' = x + (m - r); since we've shifted everything initially by e +// we need to shift the remainder too before subtracting from the +// shifted m. + + movq r, d + shrq %cl, d + movq m, c + subq d, c + xorl ishort, ishort + +// Unless n = 0, preload the zeroth digit and bump up the x pointer by +// 8 and n down by 1, to ease indexing and comparison using the same +// variable i in the main loop. When n = 0 we leave it alone, as the +// comparison i < n will always fail and the x pointer is unused. 
+ + xorq d, d + testq n, n + jz bignum_cdiv_loop + movq (x), d + addq $8, x + decq n + +bignum_cdiv_loop: + +// Load the next digit up to get [l,d] then shift right e places + + xorq l, l + cmpq n, i + jnc bignum_cdiv_noload + movq (x,i,8), l +bignum_cdiv_noload: + shrdq %cl, l, d + addq c, d + sbbq c, c + negq c + +// Now the effective sum is [c,a] where the carry-in has been absorbed. +// Do the main Montgomery step with the (odd) m, writing back ~q. Finally +// set d to the next digit ready for the following iteration. + + movq w, a + imulq d, a + notq a + movq a, (z,i,8) + notq a + + mulq m + addq d, a + adcq h, c + + movq l, d + + incq i + cmpq k, i + jc bignum_cdiv_loop + +// Return the modulus + +bignum_cdiv_end: + movq r, %rax + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cdiv_exact.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cdiv_exact.S new file mode 100644 index 00000000000..98cfa63b70f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cdiv_exact.S @@ -0,0 +1,193 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Divide by a single word, z := x / m *when known to be exact* +// Inputs x[n], m; output z[k] +// +// extern void bignum_cdiv_exact +// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t m); +// +// Does the "z := x / m" operation where x is n digits and result z is k, +// *assuming* that m is nonzero and that the input x is in fact an +// exact multiple of m. (If this isn't known, use the general bignum_cdiv +// function instead.) In general the result is truncated to k digits. +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x, R8 = m +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = n, R9 = x, [RSP+40] = m +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cdiv_exact) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cdiv_exact) + .text + +#define k %rdi +#define z %rsi +#define m %r8 + +// These parameters get moved because of special uses for %rcx, %rdx + +#define n %r9 +#define x %r10 + +// This needs to be in %rcx for variable shifts with %cl + +#define e %rcx + +// Other variables + +#define w %r11 +#define d %r12 +#define i %rbx +#define c %r13 +#define t %r14 + +#define a %rax +#define h %rdx + +#define ishort %ebx +#define hshort %edx + +S2N_BN_SYMBOL(bignum_cdiv_exact): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + +// If k = 0 then there's nothing to be done + + testq k, k + jz bignum_cdiv_exact_end + +// Move parameters that need a new home + + movq %rdx, n + movq %rcx, x + +// Let e be the number of trailing zeros in m (we can ignore m = 0) + + bsfq m, e + +// Now just shift m right by e bits. So hereafter we can assume m is odd +// but we first need to shift the input right by e bits then divide by m. 
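As a one-word illustration of the strategy just described (strip the trailing-zero count e from both the divisor and the dividend, then multiply by the inverse of the remaining odd factor modulo 2^64), here is a hedged C sketch; cdiv_exact_1 and negmodinv64 are hypothetical names (the latter repeats the helper from the earlier sketch so the example stays self-contained), and __builtin_ctzll assumes a GCC/Clang-style compiler.

#include <assert.h>
#include <stdint.h>

/* Same hypothetical helper as before: negated inverse of odd m mod 2^64. */
static uint64_t negmodinv64(uint64_t m) {
  uint64_t w = (m - (m << 2)) ^ 2;
  uint64_t e = w * m + 1;
  w *= 1 + e; e *= e;
  w *= 1 + e; e *= e;
  w *= 1 + e; e *= e;
  return w * (1 + e);
}

/* Exact division x / m for one word, assuming m != 0 and m divides x. */
static uint64_t cdiv_exact_1(uint64_t x, uint64_t m) {
  unsigned e = (unsigned)__builtin_ctzll(m);   /* trailing zeros of m */
  uint64_t modd = m >> e;                      /* odd part of m */
  return (x >> e) * (0 - negmodinv64(modd));   /* multiply by modd^{-1} */
}

int main(void) {
  uint64_t m = 48, y = 123456789;
  assert(cdiv_exact_1(m * y, m) == y);
  return 0;
}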
+ + shrq %cl, m + +// Compute the negated modular inverse w with w * m + 1 == 0 (mod 2^64) +// This is essentially the same as word_negmodinv. + + movq m, a + movq m, w + shlq $2, a + subq a, w + xorq $2, w + movq w, a + imulq m, a + movl $2, hshort + addq a, h + addq $1, a + imulq h, w + imulq a, a + movl $1, hshort + addq a, h + imulq h, w + imulq a, a + movl $1, hshort + addq a, h + imulq h, w + imulq a, a + movl $1, hshort + addq a, h + imulq h, w + +// Consider x' = x + m and do a Montgomery reduction, keeping the cofactor z. +// This gives us x' + m * z = 2^{64k} * c where c <= m. Assuming x = m * y +// we then have m * y + m + m * z = 2^{64k} * c, i.e. +// +// m * (y + z + 1) = 2^{64k} * c +// +// This means m * (y + z + 1) == 0 (mod 2^{64k}), even when we truncate +// x to k digits (if in fact k < n). Since m is odd, it's coprime to +// 2^{64k} so we can cancel and get y + z + 1 == 0 (mod 2^{64k}), and +// hence using logical complement y == ~z (mod 2^{64k}). Thus we can +// write back the logical complements of the cofactor as the answer. +// Start with carry word c = m to make the initial tweak x' = x + m. + + movq m, c + xorl ishort, ishort + +// Unless n = 0, preload the zeroth digit and bump up the x pointer by +// 8 and n down by 1, to ease indexing and comparison using the same +// variable i in the main loop. When n = 0 we leave it alone, as the +// comparison i < n will always fail and the x pointer is unused. + + xorq d, d + testq n, n + jz bignum_cdiv_exact_loop + movq (x), d + addq $8, x + decq n + +bignum_cdiv_exact_loop: + +// Load the next digit up to get [t,d] then shift right e places + + xorq t, t + cmpq n, i + jnc bignum_cdiv_exact_noload + movq (x,i,8), t +bignum_cdiv_exact_noload: + shrdq %cl, t, d + addq c, d + sbbq c, c + negq c + +// Now the effective sum is [c,a] where the carry-in has been absorbed. +// Do the main Montgomery step with the (odd) m, writing back ~q. Finally +// set d to the next digit ready for the following iteration. + + movq w, a + imulq d, a + notq a + movq a, (z,i,8) + notq a + + mulq m + addq d, a + adcq h, c + + movq t, d + + incq i + cmpq k, i + jc bignum_cdiv_exact_loop + +bignum_cdiv_exact_end: + popq %r14 + popq %r13 + popq %r12 + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cld.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cld.S new file mode 100644 index 00000000000..a3581a6b7a5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cld.S @@ -0,0 +1,73 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count leading zero digits (64-bit words) +// Input x[k]; output function return +// +// extern uint64_t bignum_cld (uint64_t k, uint64_t *x); +// +// In the case of a zero bignum as input the result is k +// +// Standard x86-64 ABI: RDI = k, RSI = x, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = x, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cld) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cld) + .text + +#define k %rdi +#define x %rsi +#define i %rax +#define a %rcx +#define j %rdx + + + +S2N_BN_SYMBOL(bignum_cld): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Initialize the index i and also prepare default return value of 0 (i = %rax) + + xorq i, i + +// If the bignum is zero-length, just return k = 0 + + testq k, k + jz bignum_cld_end + +// Run over the words j = 0..i-1, and set i := j + 1 when hitting nonzero a[j] + + xorq j, j +bignum_cld_loop: + movq (x,j,8), a + incq j + testq a, a + cmovnzq j, i + cmpq k, j + jnz bignum_cld_loop + + negq %rax + addq %rdi, %rax + +bignum_cld_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_clz.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_clz.S new file mode 100644 index 00000000000..f4014f7e35e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_clz.S @@ -0,0 +1,88 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count leading zero bits +// Input x[k]; output function return +// +// extern uint64_t bignum_clz (uint64_t k, uint64_t *x); +// +// In the case of a zero bignum as input the result is 64 * k +// +// In principle this has a precondition k < 2^58, but obviously that +// is always true in practice because of address space limitations +// +// Standard x86-64 ABI: RDI = k, RSI = x, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = x, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_clz) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_clz) + .text + +#define k %rdi +#define x %rsi +#define i %rax +#define w %rdx +#define a %rcx +#define j %r8 + + + +S2N_BN_SYMBOL(bignum_clz): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Initialize the index i and also prepare default return value of 0 (i = %rax) + + xorq i, i + +// If the bignum is zero-length, just return 0 + + testq k, k + jz bignum_clz_end + +// Use w = a[i-1] to store nonzero words in a bottom-up sweep +// Set the initial default to be as if we had a 11...11 word directly below + + movq $-1, w + xorq j, j +bignum_clz_loop: + movq (x,j,8), a + incq j + testq a, a + cmovnzq j, i + cmovnzq a, w + cmpq k, j + jnz bignum_clz_loop + +// Now w = a[i-1] is the highest nonzero word, or in the zero case the +// default of the "extra" 11...11 = a[0-1]. 
We now want 64*(k - i) + clz(w) = +// 64*(k - i) + (63 - bsr(w)). Note that this code does not rely on the +// behavior of the bsr instruction for zero inputs, where it is undefined + + subq i, k + shlq $6, k + bsrq w, %rax + xorq $63, %rax + addq k, %rax + +bignum_clz_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cmadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cmadd.S new file mode 100644 index 00000000000..d423ebb00c8 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cmadd.S @@ -0,0 +1,144 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply-add with single-word multiplier, z := z + c * y +// Inputs c, y[n]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_cmadd +// (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); +// +// Does the "z := z + c * y" operation where y is n digits, result z is p. +// Truncates the result in general. +// +// The return value is a high/carry word that is meaningful when p = n + 1, or +// more generally when n <= p and the result fits in p + 1 digits. In these +// cases it gives the top digit of the (p + 1)-digit result. +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = c, RCX = n, R8 = y, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = c, R9 = n, [RSP+40] = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmadd) + .text + +#define p %rdi +#define z %rsi +#define c %r9 +#define n %rcx +#define x %r8 + +#define i %r10 +#define h %r11 + +#define r %rbx + +#define hshort %r11d +#define ishort %r10d + + + +S2N_BN_SYMBOL(bignum_cmadd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// Seems hard to avoid one more register + + pushq %rbx + +// First clamp the input size n := min(p,n) since we can never need to read +// past the p'th term of the input to generate p-digit output. 
+// Subtract p := p - min(n,p) so it holds the size of the extra tail needed + + cmpq n, p + cmovcq p, n + subq n, p + +// Initialize high part h = 0; if n = 0 do nothing but return that zero + + xorq h, h + testq n, n + jz bignum_cmadd_end + +// Move c into a safer register as multiplies overwrite %rdx + + movq %rdx, c + +// Initialization of the loop: 2^64 * CF + [h,z_0'] = z_0 + c * x_0 + + movq (x), %rax + mulq c + addq %rax, (z) + movq %rdx, h + movl $1, ishort + decq n + jz bignum_cmadd_hightail + +// Main loop, where we always have CF + previous high part h to add in + +bignum_cmadd_loop: + adcq (z,i,8), h + sbbq r, r + movq (x,i,8), %rax + mulq c + subq r, %rdx + addq h, %rax + movq %rax, (z,i,8) + movq %rdx, h + incq i + decq n + jnz bignum_cmadd_loop + +bignum_cmadd_hightail: + adcq $0, h + +// Propagate the carry all the way to the end with h as extra carry word + +bignum_cmadd_tail: + testq p, p + jz bignum_cmadd_end + + addq h, (z,i,8) + movl $0, hshort + incq i + decq p + jz bignum_cmadd_highend + +bignum_cmadd_tloop: + adcq h, (z,i,8) + incq i + decq p + jnz bignum_cmadd_tloop + +bignum_cmadd_highend: + + adcq $0, h + +// Return the high/carry word + +bignum_cmadd_end: + movq h, %rax + + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cmnegadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cmnegadd.S new file mode 100644 index 00000000000..cc9e80ccfd0 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cmnegadd.S @@ -0,0 +1,154 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Negated multiply-add with single-word multiplier, z := z - c * y +// Inputs c, y[n]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_cmnegadd +// (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); +// +// Does the "z := z - c * y" operation where y is n digits, result z is p. +// Truncates the result in general. +// +// The return value is a high/carry word that is meaningful when n <= p. +// It is interpreted negatively as z' - 2^{64k} * return = z - c * y. +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = c, RCX = n, R8 = y, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = c, R9 = n, [RSP+40] = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmnegadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmnegadd) + .text + +#define p %rdi +#define z %rsi +#define c %r9 +#define n %rcx +#define x %r8 + +#define i %r10 +#define h %r11 + +#define r %rbx + +#define hshort %r11d +#define ishort %r10d + + + +S2N_BN_SYMBOL(bignum_cmnegadd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// Seems hard to avoid one more register + + pushq %rbx + +// First clamp the input size n := min(p,n) since we can never need to read +// past the p'th term of the input to generate p-digit output. 
+// Subtract p := p - min(n,p) so it holds the size of the extra tail needed + + cmpq n, p + cmovcq p, n + subq n, p + +// Initialize high part h = 0; if n = 0 do nothing but return that zero + + xorq h, h + testq n, n + jz bignum_cmnegadd_end + +// Move c into a safer register as multiplies overwrite %rdx + + movq %rdx, c + +// Initialization of the loop: 2^64 * CF + [h,z_0'] = z_0 + c * ~x_0 + c + + movq (x), %rax + notq %rax + mulq c + addq c, %rax + adcq $0, %rdx + addq %rax, (z) + movq %rdx, h + movl $1, ishort + decq n + jz bignum_cmnegadd_tail + +// Main loop, where we always have CF + previous high part h to add in + +bignum_cmnegadd_loop: + adcq (z,i,8), h + sbbq r, r + movq (x,i,8), %rax + notq %rax + mulq c + subq r, %rdx + addq h, %rax + movq %rax, (z,i,8) + movq %rdx, h + incq i + decq n + jnz bignum_cmnegadd_loop + +// At this point we have 2^{64n} * (h + CF) + z' = z + c * (2^{64n} - x) +// so z' - 2^{64n} * (c - (h + CF)) = z - c * x. +// Since z - c * x < 2^{64n} we must have c - (h + CF) >= 0. +// Accumulate the negative carry in h for consistency with trivial cases. + +bignum_cmnegadd_tail: + sbbq h, c + movq c, h + +// Propagate the carry all the way to the end with h as extra carry word + + testq p, p + jz bignum_cmnegadd_end + + subq h, (z,i,8) + movl $0, hshort + incq i + decq p + jz bignum_cmnegadd_highend + +bignum_cmnegadd_tloop: + sbbq h, (z,i,8) + incq i + decq p + jnz bignum_cmnegadd_tloop + +bignum_cmnegadd_highend: + +// Adjust the high word with the carry from subtraction + + adcq $0, h + +// Return the high/carry word + +bignum_cmnegadd_end: + movq h, %rax + + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cmod.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cmod.S new file mode 100644 index 00000000000..91aa3f4d828 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cmod.S @@ -0,0 +1,223 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Find bignum modulo a single word +// Input x[k], m; output function return +// +// extern uint64_t bignum_cmod (uint64_t k, uint64_t *x, uint64_t m); +// +// Returns x mod m, assuming m is nonzero. +// +// Standard x86-64 ABI: RDI = k, RSI = x, RDX = m, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = x, R8 = m, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmod) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmod) + .text + +#define k %rdi +#define x %rsi + +// This has to be %rcx for variable shifts + +#define e %rcx + +// We share the same variable for m and n, just shifting left then right. +// And h is kept in %rdx which does work despite the special operands of mul. + +#define m %r8 +#define n %r8 + +#define w %r9 +#define a %rax +#define r %r10 +#define h %rdx +#define l %r11 + +#define ashort %eax +#define hshort %edx + +S2N_BN_SYMBOL(bignum_cmod): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Initialize l = 0 now for convenience (we eventually need to do it). 
+// If the bignum is zero-length, l is already the right answer of 0 + + xorq l, l + testq k, k + jz bignum_cmod_end + +// Move m into its permanent home (also used for n). +// Find number of leading zeros of m and let n = 2^e m so that for an +// in-scope (nonzero) input m we have n >= 2^63, e <= 63. + + movq %rdx, m + bsrq m, e + xorq $63, e + shlq %cl, m + +// A near-clone of word_recip so 2^64 + w = ceil(2^128 / n) - 1 + + movq n, r + movq $0x1FFFFFFFFFFFF, w + shrq $16, r + xorq r, w + incq r + shrq $32, w + movq r, h + imulq w, h + negq h + movq h, a + shrq $49, a + imulq a, a + shrq $34, h + addq a, h + orq $0x40000000, a + imulq h, a + shrq $30, a + imulq w, a + shlq $30, w + addq a, w + shrq $30, w + movq r, h + imulq w, h + negq h + shrq $24, h + imulq w, h + shlq $16, w + shrq $24, h + addq h, w + movq r, h + imulq w, h + negq h + shrq $32, h + imulq w, h + shlq $31, w + shrq $17, h + addq h, w + movq n, a + mulq w + shrdq $60, h, a + movq w, h + shrq $33, h + notq a + imulq h, a + shlq $1, w + shrq $33, a + addq a, w + addq $1, w + movq n, a + sbbq $0, w + mulq w + addq n, h + sbbq $0, w + +// Take the residue r = 2^128 - (2^64 + w) * n, which by the above bound +// we know fits in 64 bits. We know 2^128 == r (mod n) and hence (mod m). + + movq n, r + imulq w, r + negq r + +// Now just go down through the digits accumulating [h;l] == x (mod n) +// by 2^64 * [h;l] + d = 2^128 * h + [l;d] == r * h + [l; d]. That addition +// may overflow with a carry, say 2^128 + [h';l'] = r * h + [l; d], in +// which case we subtract 2^128 - r (which is divisible by m and keeping +// things in 128 bits we just add r). Thus the overall bound when we initially +// overflow is r * h + [l; d] - (2^128 - r) = r * (h + 1) + [l; d] - 2^128 +// < 2^128 so we stay inside 2 words + + xorl hshort, hshort +bignum_cmod_loop: + movq h, a + mulq r + addq -8(x,k,8), a + adcq l, h + movq a, l + sbbq a, a + andq r, a + addq a, l + adcq $0, h + decq k + jnz bignum_cmod_loop + +// Now do reciprocal multiplication to reduce the 2-word modular equivalent +// [h;l] to the single word l. If we assume the truncations are as follows +// 2^64 + w = 2^128 / n - epsilon (0 <= epsilon <= 1) +// q = (w * h / 2^64) - delta (0 <= delta <= 1) +// the net remainder is l + (h/2^64 * epsilon + delta) * n < l + 2 * n. +// In general this needs two rounds of comparison to guarantee getting +// into a single word (though one more mul could be used instead). +// Also, the quotient estimate can overflow so we use r as extra addend +// 2^64 * n when the initial addition overflows. The overall multiple +// of n can't itself overflow, since we know it's an underestimate of +// the initial residue. + + movq h, k // back up h for muls + movq w, a + mulq h + addq k, h + sbbq r, r + andq n, r // So q = (r;h) + + movq h, a + mulq n + addq r, h + xorq r, r + subq a, l + sbbq h, k // (k,l) = first reduction + + cmovnzq n, r + xorl ashort, ashort + subq r, l + sbbq a, k + + cmovnzq n, a + subq a, l + +// One more reciprocal multiplication to do a modular reduction, but now in +// one word and in terms of the original m. For the quotient estimate we want +// q = ((2^64 + w) * l) / 2^{128-e} = ((2^64 + w) * l) / 2^65 / 2^{63-e}. + + movq w, a + mulq l + addq l, h + rcr $1, h + + shrq %cl, m + xorq $63, e + shrq %cl, h + + imulq m, h + subq h, l + +// Note that since there is no neglected "low" part of the single word, +// one round of correction suffices; in the analog of the above l = 0 +// and hence the residue so far is already < 2 * m. 
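For validating results of this routine, a straightforward functional reference can be written in C. The sketch below is illustrative only: cmod_ref is a hypothetical name, it is not constant time, and it relies on compiler-provided unsigned __int128 division (GCC/Clang) rather than the reciprocal scheme above.

#include <stdint.h>

/* x mod m for a k-digit little-endian x and nonzero m, accumulating top-down
   with the same invariant as the loop above: r := (2^64 * r + d) mod m. */
static uint64_t cmod_ref(uint64_t k, const uint64_t *x, uint64_t m) {
  unsigned __int128 r = 0;
  while (k--) {
    r = ((r << 64) | x[k]) % m;
  }
  return (uint64_t)r;
}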
+ + movq l, a + subq m, l +bignum_cmod_end: + cmovncq l, a +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cmul.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cmul.S new file mode 100644 index 00000000000..3a936011e17 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cmul.S @@ -0,0 +1,127 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word, z := c * y +// Inputs c, y[n]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_cmul +// (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); +// +// Does the "z := c * y" operation where y is n digits, result z is p. +// Truncates the result in general unless p >= n + 1. +// +// The return value is a high/carry word that is meaningful when p >= n as +// giving the high part of the result. Since this is always zero if p > n, +// it is mainly of interest in the special case p = n, i.e. where the source +// and destination have the same nominal size, when it gives the extra word +// of the full result. +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = c, RCX = n, R8 = y, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = c, R9 = n, [RSP+40] = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul) + .text + +#define p %rdi +#define z %rsi +#define c %r9 +#define n %rcx +#define x %r8 + +#define i %r10 +#define h %r11 + + + +S2N_BN_SYMBOL(bignum_cmul): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// First clamp the input size n := min(p,n) since we can never need to read +// past the p'th term of the input to generate p-digit output. Now we can +// assume that n <= p + + cmpq n, p + cmovcq p, n + +// Initialize current input/output pointer offset i and high part h. 
+// But then if n = 0 skip the multiplication and go to the tail part + + xorq h, h + xorq i, i + testq n, n + jz bignum_cmul_tail + +// Move c into a safer register as multiplies overwrite %rdx + + movq %rdx, c + +// Initialization of the loop: [h,l] = c * x_0 + + movq (x), %rax + mulq c + movq %rax, (z) + movq %rdx, h + incq i + cmpq n, i + jz bignum_cmul_tail + +// Main loop doing the multiplications + +bignum_cmul_loop: + movq (x,i,8), %rax + mulq c + addq h, %rax + adcq $0, %rdx + movq %rax, (z,i,8) + movq %rdx, h + incq i + cmpq n, i + jc bignum_cmul_loop + +// Add a tail when the destination is longer + +bignum_cmul_tail: + cmpq p, i + jnc bignum_cmul_end + movq h, (z,i,8) + xorq h, h + incq i + cmpq p, i + jnc bignum_cmul_end + +bignum_cmul_tloop: + movq h, (z,i,8) + incq i + cmpq p, i + jc bignum_cmul_tloop + +// Return the high/carry word + +bignum_cmul_end: + movq h, %rax + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_coprime.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_coprime.S new file mode 100644 index 00000000000..442b84b8c15 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_coprime.S @@ -0,0 +1,518 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Test bignums for coprimality, gcd(x,y) = 1 +// Inputs x[m], y[n]; output function return; temporary buffer t[>=2*max(m,n)] +// +// extern uint64_t bignum_coprime +// (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y, uint64_t *t); +// +// Test for whether two bignums are coprime (no common factor besides 1). +// This is equivalent to testing if their gcd is 1, but a bit faster than +// doing those two computations separately. +// +// Here bignum x is m digits long, y is n digits long and the temporary +// buffer t needs to be 2 * max(m,n) digits long. The return value is +// 1 if coprime(x,y) and 0 otherwise. +// +// Standard x86-64 ABI: RDI = m, RSI = x, RDX = n, RCX = y, R8 = t, returns RAX +// Microsoft x64 ABI: RCX = m, RDX = x, R8 = n, R9 = y, [RSP+40] = t, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_coprime) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_coprime) + .text + +// We get CHUNKSIZE bits per outer iteration, 64 minus a bit for proxy errors + +#define CHUNKSIZE 58 + +// These variables are so fundamental we keep them consistently in registers. 
+// m is in fact the temporary buffer argument w so use the same register + +#define m %r8 +#define n %r15 +#define k %r14 +#define l %r13 + +// These are kept on the stack since there aren't enough registers + +#define mat_mm (%rsp) +#define mat_mn 8(%rsp) +#define mat_nm 16(%rsp) +#define mat_nn 24(%rsp) +#define t 32(%rsp) +#define evenor 40(%rsp) + +#define STACKVARSIZE 48 + +// These are shorthands for common temporary register + +#define a %rax +#define b %rbx +#define c %rcx +#define d %rdx +#define i %r9 + +// Temporaries for the top proxy selection part + +#define c1 %r10 +#define c2 %r11 +#define h1 %r12 +#define h2 %rbp +#define l1 %rdi +#define l2 %rsi + +// Re-use for the actual proxies; m_hi = h1 and n_hi = h2 are assumed + +#define m_hi %r12 +#define n_hi %rbp +#define m_lo %rdi +#define n_lo %rsi + +// Re-use for the matrix entries in the inner loop, though they +// get spilled to the corresponding memory locations mat_... + +#define m_m %r10 +#define m_n %r11 +#define n_m %rcx +#define n_n %rdx + +#define ishort %r9d +#define m_mshort %r10d +#define m_nshort %r11d +#define n_mshort %ecx +#define n_nshort %edx + +// Because they are so unmemorable + +#define arg1 %rdi +#define arg2 %rsi +#define arg3 %rdx +#define arg4 %rcx + +S2N_BN_SYMBOL(bignum_coprime): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// Save all required registers and make room on stack for all the above vars + + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $STACKVARSIZE, %rsp + +// Compute k = max(m,n), and if this is zero skip to the end. Note that +// in this case k is also in %rax so serves as the right answer of "false" + + movq arg1, %rax + cmpq arg3, %rax + cmovcq arg3, %rax + movq %rax, k + + testq %rax, %rax + jz bignum_coprime_end + +// Set up inside w two size-k buffers m and n + + leaq (m,k,8), n + +// Copy the input x into the buffer m, padding with zeros as needed + + xorq i, i + testq arg1, arg1 + jz bignum_coprime_xpadloop +bignum_coprime_xloop: + movq (arg2,i,8), a + movq a, (m,i,8) + incq i + cmpq arg1, i + jc bignum_coprime_xloop + cmpq k, i + jnc bignum_coprime_xskip +bignum_coprime_xpadloop: + movq $0, (m,i,8) + incq i + cmpq k, i + jc bignum_coprime_xpadloop +bignum_coprime_xskip: + +// Copy the input y into the buffer n, padding with zeros as needed + + xorq i, i + testq arg3, arg3 + jz bignum_coprime_ypadloop +bignum_coprime_yloop: + movq (arg4,i,8), a + movq a, (n,i,8) + incq i + cmpq arg3, i + jc bignum_coprime_yloop + cmpq k, i + jnc bignum_coprime_yskip +bignum_coprime_ypadloop: + movq $0, (n,i,8) + incq i + cmpq k, i + jc bignum_coprime_ypadloop +bignum_coprime_yskip: + +// Set up the outer loop count of 64 * sum of input sizes. +// The invariant is that m * n < 2^t at all times. + + leaq (arg1,arg3), a + shlq $6, a + movq a, t + +// Record for the very end the OR of the lowest words. +// If the bottom bit is zero we know both are even so the answer is false. +// But since this is constant-time code we still execute all the main part. + + movq (m), a + movq (n), b + orq b, a + movq a, evenor + +// Now if n is even trigger a swap of m and n. This ensures that if +// one or other of m and n is odd then we make sure now that n is, +// as expected by our invariant later on. 
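The loop below uses the standard masked-XOR conditional swap, so the memory access pattern and the work done are identical whether or not the swap happens. A small illustrative C rendering of the same idiom (condswap is a hypothetical name) is:

#include <stdint.h>

/* Swap the k-digit bignums m and n when swap_mask is all-ones; leave them
   untouched when it is all-zeros. Same pattern either way, so branch-free. */
static void condswap(uint64_t k, uint64_t *m, uint64_t *n, uint64_t swap_mask) {
  for (uint64_t i = 0; i < k; i++) {
    uint64_t d = (m[i] ^ n[i]) & swap_mask;   /* zero when not swapping */
    m[i] ^= d;
    n[i] ^= d;
  }
}

Matching the code here, the mask would be (n[0] & 1) - 1, which is all-ones exactly when n is even, i.e. when the swap should fire.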
+ + andq $1, b + subq $1, b + + xorq i, i +bignum_coprime_swaploop: + movq (m,i,8), a + movq (n,i,8), c + movq a, d + xorq c, d + andq b, d + xorq d, a + xorq d, c + movq a, (m,i,8) + movq c, (n,i,8) + incq i + cmpq k, i + jnz bignum_coprime_swaploop + +// Start of the main outer loop iterated t / CHUNKSIZE times + +bignum_coprime_outerloop: + +// We need only bother with sharper l = min k (ceil(t/64)) digits +// Either both m and n fit in l digits, or m has become zero and so +// nothing happens in the loop anyway and this makes no difference. + + movq t, l + addq $63, l + shrq $6, l + cmpq k, l + cmovncq k, l + +// Select upper and lower proxies for both m and n to drive the inner +// loop. The lower proxies are simply the lowest digits themselves, +// m_lo = m[0] and n_lo = n[0], while the upper proxies are bitfields +// of the two inputs selected so their top bit (63) aligns with the +// most significant bit of *either* of the two inputs. + + xorq h1, h1 // Previous high and low for m + xorq l1, l1 + xorq h2, h2 // Previous high and low for n + xorq l2, l2 + xorq c2, c2 // Mask flag: previous word of one was nonzero + // and in this case h1 and h2 are those words + + xorq i, i +bignum_coprime_toploop: + movq (m,i,8), b + movq (n,i,8), c + movq c2, c1 + andq h1, c1 + andq h2, c2 + movq b, a + orq c, a + negq a + cmovcq c1, l1 + cmovcq c2, l2 + cmovcq b, h1 + cmovcq c, h2 + sbbq c2, c2 + incq i + cmpq l, i + jc bignum_coprime_toploop + + movq h1, a + orq h2, a + bsrq a, c + xorq $63, c + shldq %cl, l1, h1 + shldq %cl, l2, h2 + +// m_lo = m[0], n_lo = n[0]; + + movq (m), %rax + movq %rax, m_lo + + movq (n), %rax + movq %rax, n_lo + +// Now the inner loop, with i as loop counter from CHUNKSIZE down. +// This records a matrix of updates to apply to the initial +// values of m and n with, at stage j: +// +// sgn * m' = (m_m * m - m_n * n) / 2^j +// -sgn * n' = (n_m * m - n_n * n) / 2^j +// +// where "sgn" is either +1 or -1, and we lose track of which except +// that both instance above are the same. This throwing away the sign +// costs nothing (since we have to correct in general anyway because +// of the proxied comparison) and makes things a bit simpler. But it +// is simply the parity of the number of times the first condition, +// used as the swapping criterion, fires in this loop. + + movl $1, m_mshort + movl $0, m_nshort + movl $0, n_mshort + movl $1, n_nshort + movl $CHUNKSIZE, ishort + +// Stash more variables over the inner loop to free up regs + + movq k, mat_mn + movq l, mat_nm + movq m, mat_mm + movq n, mat_nn + +// Conceptually in the inner loop we follow these steps: +// +// * If m_lo is odd and m_hi < n_hi, then swap the four pairs +// (m_hi,n_hi); (m_lo,n_lo); (m_m,n_m); (m_n,n_n) +// +// * Now, if m_lo is odd (old or new, doesn't matter as initial n_lo is odd) +// m_hi := m_hi - n_hi, m_lo := m_lo - n_lo +// m_m := m_m + n_m, m_n := m_n + n_n +// +// * Halve and double them +// m_hi := m_hi / 2, m_lo := m_lo / 2 +// n_m := n_m * 2, n_n := n_n * 2 +// +// The actual computation computes updates before actually swapping and +// then corrects as needed. 
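For reference, the three conceptual steps just listed translate directly into the following non-constant-time C sketch over the word-sized proxies and the 2x2 update matrix; the gcd_state struct and gcd_step are illustrative names, and the actual assembly below instead computes both outcomes and selects with conditional moves.

#include <stdint.h>

/* Word-level state: proxies for m and n plus the update-matrix entries. */
typedef struct {
  uint64_t m_hi, m_lo, n_hi, n_lo;
  uint64_t m_m, m_n, n_m, n_n;
} gcd_state;

/* One inner-loop step, exactly as described in the comment above. */
static void gcd_step(gcd_state *s) {
  if ((s->m_lo & 1) && s->m_hi < s->n_hi) {      /* swap the four pairs */
    uint64_t t;
    t = s->m_hi; s->m_hi = s->n_hi; s->n_hi = t;
    t = s->m_lo; s->m_lo = s->n_lo; s->n_lo = t;
    t = s->m_m;  s->m_m  = s->n_m;  s->n_m  = t;
    t = s->m_n;  s->m_n  = s->n_n;  s->n_n  = t;
  }
  if (s->m_lo & 1) {                             /* subtract / accumulate */
    s->m_hi -= s->n_hi;
    s->m_lo -= s->n_lo;
    s->m_m  += s->n_m;
    s->m_n  += s->n_n;
  }
  s->m_hi >>= 1;                                 /* halve the m side */
  s->m_lo >>= 1;
  s->n_m <<= 1;                                  /* double the n row */
  s->n_n <<= 1;
}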
+ +bignum_coprime_innerloop: + + xorl %eax, %eax + xorl %ebx, %ebx + xorq m, m + xorq n, n + btq $0, m_lo + + cmovcq n_hi, %rax + cmovcq n_lo, %rbx + cmovcq n_m, m + cmovcq n_n, n + + movq m_lo, l + subq %rbx, m_lo + subq l, %rbx + movq m_hi, k + subq %rax, k + cmovcq m_hi, n_hi + leaq -1(k), m_hi + cmovcq %rbx, m_lo + cmovcq l, n_lo + notq m_hi + cmovcq m_m, n_m + cmovcq m_n, n_n + cmovncq k, m_hi + + shrq $1, m_lo + addq m, m_m + addq n, m_n + shrq $1, m_hi + addq n_m, n_m + addq n_n, n_n + +// End of the inner for-loop + + decq i + jnz bignum_coprime_innerloop + +// Unstash the temporary variables + + movq mat_mn, k + movq mat_nm, l + movq mat_mm, m + movq mat_nn, n + +// Put the matrix entries in memory since we're out of registers +// We pull them out repeatedly in the next loop + + movq m_m, mat_mm + movq m_n, mat_mn + movq n_m, mat_nm + movq n_n, mat_nn + +// Now actually compute the updates to m and n corresponding to that matrix, +// and correct the signs if they have gone negative. First we compute the +// (k+1)-sized updates with the following invariant (here h1 and h2 are in +// fact carry bitmasks, either 0 or -1): +// +// h1::l1::m = m_m * m - m_n * n +// h2::l2::n = n_m * m - n_n * n + + xorq i, i + xorq h1, h1 + xorq l1, l1 + xorq h2, h2 + xorq l2, l2 +bignum_coprime_crossloop: + + movq (m,i,8), c + movq mat_mm, a + mulq c + addq a, l1 + adcq $0, d + movq d, c1 // Now c1::l1 is +ve part 1 + + movq mat_nm, a + mulq c + addq a, l2 + adcq $0, d + movq d, c2 // Now c2::l2 is +ve part 2 + + movq (n,i,8), c + movq mat_mn, a + mulq c + subq h1, d // Now d::a is -ve part 1 + + subq a, l1 + sbbq d, c1 + sbbq h1, h1 + movq l1, (m,i,8) + movq c1, l1 + + movq mat_nn, a + mulq c + subq h2, d // Now d::a is -ve part 2 + + subq a, l2 + sbbq d, c2 + sbbq h2, h2 + movq l2, (n,i,8) + movq c2, l2 + + incq i + cmpq l, i + jc bignum_coprime_crossloop + +// Now fix the signs of m and n if they have gone negative + + xorq i, i + movq h1, c1 // carry-in coded up as well + movq h2, c2 // carry-in coded up as well + xorq h1, l1 // for the bignum_coprime_end digit + xorq h2, l2 // for the bignum_coprime_end digit +bignum_coprime_optnegloop: + movq (m,i,8), a + xorq h1, a + negq c1 + adcq $0, a + sbbq c1, c1 + movq a, (m,i,8) + movq (n,i,8), a + xorq h2, a + negq c2 + adcq $0, a + sbbq c2, c2 + movq a, (n,i,8) + incq i + cmpq l, i + jc bignum_coprime_optnegloop + subq c1, l1 + subq c2, l2 + +// Now shift them right CHUNKSIZE bits + + movq l, i +bignum_coprime_shiftloop: + movq -8(m,i,8), a + movq a, h1 + shrdq $CHUNKSIZE, l1, a + movq a, -8(m,i,8) + movq h1, l1 + movq -8(n,i,8), a + movq a, h2 + shrdq $CHUNKSIZE, l2, a + movq a, -8(n,i,8) + movq h2, l2 + decq i + jnz bignum_coprime_shiftloop + +// End of main loop. We can stop if t' <= 0 since then m * n < 2^0, which +// since n is odd (in the main cases where we had one or other input odd) +// means that m = 0 and n is the final gcd. Moreover we do in fact need to +// maintain strictly t > 0 in the main loop, or the computation of the +// optimized digit bound l could collapse to 0. 
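The verdict assembled a few lines further down (compare the surviving n with 1, then AND with the saved parity word) can be summarized by this small illustrative C sketch; coprime_verdict is a hypothetical name, and the branch-free zero test plays the role of the neg/sbb/inc sequence in the assembly.

#include <stdint.h>

/* Coprime exactly when the k-digit survivor n equals 1 and at least one of
   the original inputs was odd (low bit of the saved OR of the bottom words). */
static uint64_t coprime_verdict(uint64_t k, const uint64_t *n, uint64_t evenor) {
  uint64_t a = n[0] ^ 1;                          /* zero iff bottom digit is 1 */
  for (uint64_t i = 1; i < k; i++) a |= n[i];
  uint64_t is_one = 1 ^ ((a | (0 - a)) >> 63);    /* 1 iff a == 0, i.e. n == 1 */
  return is_one & evenor;                         /* low bit of evenor if n == 1 */
}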
+ + subq $CHUNKSIZE, t + jnbe bignum_coprime_outerloop + +// Now compare n with 1 (OR of the XORs in a) + + movq (n), a + xorq $1, a + cmpq $1, k + jz bignum_coprime_finalcomb + movl $1, ishort +bignum_coprime_compareloop: + orq (n,i,8), a + incq i + cmpq k, i + jc bignum_coprime_compareloop + +// Now combine that with original "evenor" oddness flag +// The final condition is lsb(evenor) = 1 AND a = 0 + +bignum_coprime_finalcomb: + negq a + sbbq a, a + incq a + andq evenor, a + +// The end + +bignum_coprime_end: + addq $STACKVARSIZE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_copy.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_copy.S new file mode 100644 index 00000000000..50d7906e7f4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_copy.S @@ -0,0 +1,83 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Copy bignum with zero-extension or truncation, z := x +// Input x[n]; output z[k] +// +// extern void bignum_copy +// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x); +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = n, R9 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_copy) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_copy) + .text + +#define k %rdi +#define z %rsi +#define n %rdx +#define x %rcx + +#define i %r8 +#define a %rax + + + +S2N_BN_SYMBOL(bignum_copy): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Replace RDX = n with RDX = min(k,n) so we are definitely safe copying those +// Initialize the element counter to 0 + + cmpq n, k + cmovcq k, n + xorq i, i + +// If min(k,n) = 0 jump to the padding stage + + testq n, n + jz bignum_copy_padding + +bignum_copy_copyloop: + movq (x,i,8), a + movq a, (z,i,8) + incq i + cmpq n, i + jc bignum_copy_copyloop + +bignum_copy_padding: + cmpq k, i + jnc bignum_copy_end + xorq a, a + +bignum_copy_padloop: + movq a, (z,i,8) + incq i + cmpq k, i + jc bignum_copy_padloop + +bignum_copy_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_ctd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_ctd.S new file mode 100644 index 00000000000..954f386bb66 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_ctd.S @@ -0,0 +1,71 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count trailing zero digits (64-bit words) +// Input x[k]; output function return +// +// extern uint64_t bignum_ctd (uint64_t k, uint64_t *x); +// +// In the case of a zero bignum as input the result is k +// +// Standard x86-64 ABI: RDI = k, RSI = x, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = x, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_ctd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_ctd) + .text + +#define k %rdi +#define x %rsi +#define i %rdx +#define a %rax + + + +S2N_BN_SYMBOL(bignum_ctd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// If the bignum is zero-length, just return 0 + + xorq %rax, %rax + testq k, k + jz bignum_ctd_end + +// Record in i that the lowest nonzero word is i - 1, where i = k + 1 means +// that the bignum was entirely zero + + movq k, i + incq i +bignum_ctd_loop: + movq -8(x,k,8), a + testq a, a + cmovneq k, i + decq k + jnz bignum_ctd_loop + +// We now want to return i - 1 + + decq i + movq i, %rax +bignum_ctd_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_ctz.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_ctz.S new file mode 100644 index 00000000000..5dd61099564 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_ctz.S @@ -0,0 +1,87 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count trailing zero bits +// Input x[k]; output function return +// +// extern uint64_t bignum_ctz (uint64_t k, uint64_t *x); +// +// +// In the case of a zero bignum as input the result is 64 * k +// +// In principle this has a precondition k < 2^58, but obviously that +// is always true in practice because of address space limitations +// +// Standard x86-64 ABI: RDI = k, RSI = x, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = x, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_ctz) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_ctz) + .text + +#define k %rdi +#define x %rsi +#define i %rdx +#define w %rcx +#define a %rax + +#define wshort %ecx + + + +S2N_BN_SYMBOL(bignum_ctz): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// If the bignum is zero-length, just return 0 + + xorq %rax, %rax + testq k, k + jz bignum_ctz_end + +// Use w = a[i-1] to store nonzero words in a top-down sweep +// Set the initial default to be as if we had a 1 word directly above + + movq k, i + incq i + movl $1, wshort + +bignum_ctz_loop: + movq -8(x,k,8), a + testq a, a + cmovneq k, i + cmovneq a, w + decq k + jnz bignum_ctz_loop + +// Now w = a[i-1] is the lowest nonzero word, or in the zero case the +// default of the "extra" 1 = a[k]. We now want 64*(i-1) + ctz(w). 
+// Note that this code does not rely on the behavior of the BSF instruction +// for zero inputs, which is undefined according to the manual. + + decq i + shlq $6, i + bsfq w, %rax + addq i, %rax + +bignum_ctz_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_demont.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_demont.S new file mode 100644 index 00000000000..ee9ca9cfa5c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_demont.S @@ -0,0 +1,204 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from (almost-)Montgomery form z := (x / 2^{64k}) mod m +// Inputs x[k], m[k]; output z[k] +// +// extern void bignum_demont +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); +// +// Does z := (x / 2^{64k}) mod m, hence mapping out of Montgomery domain. +// In other words, this is a k-fold Montgomery reduction with same-size input. +// This can handle almost-Montgomery inputs, i.e. any k-digit bignum. +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = m +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = m +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_demont) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_demont) + .text + +#define k %rdi +#define z %rsi +#define x %rdx +#define m %rcx + +// General temp, low part of product and mul input +#define a %rax +// General temp, high part of product (no longer x) +#define b %rdx +// Negated modular inverse +#define w %r8 +// Outer loop counter +#define i %r9 +// Inner loop counter +#define j %rbx +// Home for Montgomery multiplier +#define d %rbp +#define h %r10 +#define e %r11 +#define n %r12 + +// A temp reg in the initial word-level negmodinv, same as j + +#define t %rbx + +#define ashort %eax +#define jshort %ebx + + +S2N_BN_SYMBOL(bignum_demont): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Save registers + + pushq %rbx + pushq %rbp + pushq %r12 + +// If k = 0 the whole operation is trivial + + testq k, k + jz bignum_demont_end + +// Compute word-level negated modular inverse w for m == m[0]. + + movq (m), a + + movq a, t + movq a, w + shlq $2, t + subq t, w + xorq $2, w + + movq w, t + imulq a, t + movl $2, ashort + addq t, a + addq $1, t + + imulq a, w + + imulq t, t + movl $1, ashort + addq t, a + imulq a, w + + imulq t, t + movl $1, ashort + addq t, a + imulq a, w + + imulq t, t + movl $1, ashort + addq t, a + imulq a, w + +// Initially just copy the input to the output. It would be a little more +// efficient but somewhat fiddlier to tweak the zeroth iteration below instead. 
+// After this we never use x again and can safely recycle RDX for muls + + xorq j, j +bignum_demont_iloop: + movq (x,j,8), a + movq a, (z,j,8) + incq j + cmpq k, j + jc bignum_demont_iloop + +// Outer loop, just doing a standard Montgomery reduction on z + + xorq i, i + +bignum_demont_outerloop: + movq (z), e + movq w, d + imulq e, d + movq (m), a + mulq d + addq e, a // Will be zero but want the carry + movq %rdx, h + movl $1, jshort + movq k, n + decq n + jz bignum_demont_montend + +bignum_demont_montloop: + adcq (z,j,8), h + sbbq e, e + movq (m,j,8), a + mulq d + subq e, %rdx + addq h, a + movq a, -8(z,j,8) + movq %rdx, h + incq j + decq n + jnz bignum_demont_montloop + +bignum_demont_montend: + adcq $0, h + movq h, -8(z,j,8) + +// End of outer loop. + + incq i + cmpq k, i + jc bignum_demont_outerloop + +// Now do a comparison of z with m to set a final correction mask +// indicating that z >= m and so we need to subtract m. + + xorq j, j + movq k, n +bignum_demont_cmploop: + movq (z,j,8), a + sbbq (m,j,8), a + incq j + decq n + jnz bignum_demont_cmploop + sbbq d, d + notq d + +// Now do a masked subtraction of m for the final reduced result. + + xorq e, e + xorq j, j +bignum_demont_corrloop: + movq (m,j,8), a + andq d, a + negq e + sbbq a, (z,j,8) + sbbq e, e + incq j + cmpq k, j + jc bignum_demont_corrloop + +bignum_demont_end: + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_digit.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_digit.S new file mode 100644 index 00000000000..3e41e61b9c6 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_digit.S @@ -0,0 +1,72 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Select digit x[n] +// Inputs x[k], n; output function return +// +// extern uint64_t bignum_digit (uint64_t k, uint64_t *x, uint64_t n); +// +// n'th digit of a k-digit (digit=64 bits) bignum, in constant-time style. +// Indexing starts at 0, which is the least significant digit (little-endian). +// Returns zero if n >= k, i.e. we read a digit off the end of the bignum. 
+// +// Standard x86-64 ABI: RDI = k, RSI = x, RDX = n, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = x, R8 = n, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_digit) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_digit) + .text + +#define k %rdi +#define x %rsi +#define n %rdx + +#define d %rax +#define i %rcx +#define a %r8 + +S2N_BN_SYMBOL(bignum_digit): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Set the default digit to 0, and for length zero finish immediately + + xorq d, d + testq k, k + jz bignum_digit_end + +// Main loop: run over all the digits and take note of the n'th one + + xorq i, i +bignum_digit_loop: + movq (x,i,8), a + cmpq n, i + cmovzq a, d + incq i + cmpq k, i + jc bignum_digit_loop + +// Return + +bignum_digit_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_digitsize.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_digitsize.S new file mode 100644 index 00000000000..a1902b6b4f5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_digitsize.S @@ -0,0 +1,70 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Return size of bignum in digits (64-bit word) +// Input x[k]; output function return +// +// extern uint64_t bignum_digitsize (uint64_t k, uint64_t *x); +// +// In the case of a zero bignum as input the result is 0 +// +// Standard x86-64 ABI: RDI = k, RSI = x, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = x, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_digitsize) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_digitsize) + .text + +#define k %rdi +#define x %rsi +#define i %rax +#define a %rcx +#define j %rdx + + + +S2N_BN_SYMBOL(bignum_digitsize): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Initialize the index i and also prepare default return value of 0 (i = %rax) + + xorq i, i + +// If the bignum is zero-length, just return 0 + + testq k, k + jz bignum_digitsize_end + +// Run over the words j = 0..i-1, and set i := j + 1 when hitting nonzero a[j] + + xorq j, j +bignum_digitsize_loop: + movq (x,j,8), a + incq j + testq a, a + cmovnzq j, i + cmpq k, j + jnz bignum_digitsize_loop + +bignum_digitsize_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_divmod10.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_divmod10.S new file mode 100644 index 00000000000..14bbc9e37dd --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_divmod10.S @@ -0,0 +1,98 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Divide bignum by 10: z' := z div 10, returning remainder z mod 10 +// Inputs z[k]; outputs function return (remainder) and z[k] +// +// extern uint64_t bignum_divmod10 (uint64_t k, uint64_t *z); +// +// Standard x86-64 ABI: RDI = k, RSI = z, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_divmod10) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_divmod10) + .text + +#define k %rdi +#define z %rsi + +#define d %rcx + +#define l %rdx +#define r %rax + +#define q %r8 +#define h %r8 + +#define s %r9 +#define w %r10 + +#define rshort %eax +#define wshort %r10d + +S2N_BN_SYMBOL(bignum_divmod10): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Initialize remainder to 0 and if k = 0 return + + xorl rshort, rshort + testq k, k + jz bignum_divmod10_end + +// Straightforward top-down loop doing 10 * q + r' := 2^64 * r + d + + movq $0x3333333333333334, s + movl $0x3333333, wshort + +bignum_divmod10_divloop: + movq -8(z,k,8), d + +// First re-split and shift so 2^28 * h + l = (2^64 * r + d) / 2 +// Then (2^64 * r + d) / 10 = [(2^28 - 1) / 5] * h + (h + l) / 5 + + movq d, l + shlq $35, l + shldq $35, d, r + shrq $36, l + movq r, h + + addq l, r + mulq s + imulq w, h + addq l, q + movq q, -8(z,k,8) + +// Generate the new remainder r = d - 10 * q +// Since r <= 9 we only need the low part computation ignoring carries + + leaq (q,q,4), q + negq q + leaq (d,q,2), r + + decq k + jnz bignum_divmod10_divloop + +// Return %rax = r as the final remainder + +bignum_divmod10_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_emontredc.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_emontredc.S new file mode 100644 index 00000000000..e39905a1fa1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_emontredc.S @@ -0,0 +1,155 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended Montgomery reduce, returning results in input-output buffer +// Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] +// +// extern uint64_t bignum_emontredc +// (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w); +// +// Assumes that z initially holds a 2k-digit bignum z_0, m is a k-digit odd +// bignum and m * w == -1 (mod 2^64). This function also uses z for the output +// as well as returning a carry c of 0 or 1. This encodes two numbers: in the +// lower half of the z buffer we have q = z[0..k-1], while the upper half +// together with the carry gives r = 2^{64k}*c + z[k..2k-1]. These values +// satisfy z_0 + q * m = 2^{64k} * r, i.e. r gives a raw (unreduced) Montgomery +// reduction while q gives the multiplier that was used. Another way of +// thinking of it is that if z' is the output z with the lower half replaced +// with zeros, then z_0 + q * m = 2^{128k} * c + z'. 
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = m, RCX = w, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = m, R9 = w, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_emontredc) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_emontredc) + .text + +// Argument m comes in in %rdx but we copy it to %r8 + +#define k %rdi +#define z %rsi +#define m %r8 +#define w %rcx + +// General temp, low part of product and mul input +#define a %rax +// General temp, High part of product +#define b %rdx +// Home for i'th digit or Montgomery multiplier +#define d %rbx + +// Outer loop counter +#define i %r9 +// Inner loop counter +#define j %r10 + +#define h %r11 +#define e %r12 +#define t %r13 +#define c %r14 + +#define cshort %r14d +#define jshort %r10d + + +S2N_BN_SYMBOL(bignum_emontredc): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Save registers + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + +// Initialize top carry to zero immediately to catch the k = 0 case + + xorq c, c + +// If k = 0 the whole operation is trivial + + testq k, k + jz bignum_emontredc_end + +// Move m into its permanent home since we need RDX for muls + + movq %rdx, m + +// Launch into the outer loop + + xorq i, i +bignum_emontredc_outerloop: + movq (z), e + movq w, d + imulq e, d + movq (m), a + mulq d + movq d, (z) + addq e, a // Will be zero but want the carry + movq %rdx, h + movl $1, jshort + movq k, t + decq t + jz bignum_emontredc_montend + +bignum_emontredc_montloop: + adcq (z,j,8), h + sbbq e, e + movq (m,j,8), a + mulq d + subq e, %rdx + addq h, a + movq a, (z,j,8) + movq %rdx, h + incq j + decq t + jnz bignum_emontredc_montloop + +bignum_emontredc_montend: + adcq c, h + movl $0, cshort + adcq $0, c + movq (z,k,8), a + addq h, a + movq a, (z,k,8) + adcq $0, c + +// End of outer loop. + + addq $8, z // For simple indexing, z pointer moves + incq i + cmpq k, i + jc bignum_emontredc_outerloop + +bignum_emontredc_end: + +// Put the top carry in the expected place, restore registers and return + + movq c, %rax + popq %r14 + popq %r13 + popq %r12 + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_eq.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_eq.S new file mode 100644 index 00000000000..5e6f4f82e28 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_eq.S @@ -0,0 +1,97 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Test bignums for equality, x = y +// Inputs x[m], y[n]; output function return +// +// extern uint64_t bignum_eq +// (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Standard x86-64 ABI: RDI = m, RSI = x, RDX = n, RCX = y, returns RAX +// Microsoft x64 ABI: RCX = m, RDX = x, R8 = n, R9 = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_eq) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_eq) + .text + +#define m %rdi +#define x %rsi +#define n %rdx +#define y %rcx +#define c %rax +// We can re-use n for this, not needed when d appears +#define d %rdx + +S2N_BN_SYMBOL(bignum_eq): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Initialize the accumulated OR of differences to zero + + xorq c, c + +// If m >= n jump into the m > n loop at the final equality test +// This will drop through for m = n + + cmpq n, m + jnc bignum_eq_mtest + +// Toploop for the case n > m + +bignum_eq_nloop: + decq n + orq (y,n,8), c + cmpq n, m + jnz bignum_eq_nloop + jmp bignum_eq_mmain + +// Toploop for the case m > n (or n = m which enters at "mtest") + +bignum_eq_mloop: + decq m + orq (x,m,8), c + cmpq n, m +bignum_eq_mtest: + jnz bignum_eq_mloop + +// Combined main loop for the min(m,n) lower words + +bignum_eq_mmain: + testq m, m + jz bignum_eq_end + +bignum_eq_loop: + movq -8(x,m,8), d + xorq -8(y,m,8), d + orq d, c + decq m + jnz bignum_eq_loop + +// Set a standard C condition based on whether c is nonzero + +bignum_eq_end: + negq c + sbbq c, c + incq c +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_even.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_even.S new file mode 100644 index 00000000000..2f66295cc81 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_even.S @@ -0,0 +1,52 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Test bignum for even-ness +// Input x[k]; output function return +// +// extern uint64_t bignum_even (uint64_t k, uint64_t *x); +// +// Standard x86-64 ABI: RDI = k, RSI = x, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = x, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_even) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_even) + .text + +S2N_BN_SYMBOL(bignum_even): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Set default return value of 1 and finish if k = 0 (trivially even) + + movl $1, %eax + testq %rdi, %rdi + jz bignum_even_end + +// Otherwise XOR that initial 1 with the lowest bit of the input + + xorq (%rsi), %rax + andq $1, %rax + +bignum_even_end: + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_ge.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_ge.S new file mode 100644 index 00000000000..ccc237e565b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_ge.S @@ -0,0 +1,111 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Compare bignums, x >= y +// Inputs x[m], y[n]; output function return +// +// extern uint64_t bignum_ge +// (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Standard x86-64 ABI: RDI = m, RSI = x, RDX = n, RCX = y, returns RAX +// Microsoft x64 ABI: RCX = m, RDX = x, R8 = n, R9 = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_ge) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_ge) + .text + +#define m %rdi +#define x %rsi +#define n %rdx +#define y %rcx +#define i %r8 +#define a %rax + +#define ashort %eax + + + +S2N_BN_SYMBOL(bignum_ge): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Zero the main index counter for both branches + + xorq i, i + +// Speculatively form m := m - n and do case split + + subq n, m + jc bignum_ge_ylonger + +// The case where x is longer or of the same size (m >= n) + + incq m + testq n, n + jz bignum_ge_xtest +bignum_ge_xmainloop: + movq (x,i,8), a + sbbq (y,i,8), a + incq i + decq n + jnz bignum_ge_xmainloop + jmp bignum_ge_xtest +bignum_ge_xtoploop: + movq (x,i,8), a + sbbq $0, a + incq i +bignum_ge_xtest: + decq m + jnz bignum_ge_xtoploop + sbbq a, a + incq a +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +// The case where y is longer (n > m) + +bignum_ge_ylonger: + addq n, m + subq m, n + testq m, m + jz bignum_ge_ytoploop +bignum_ge_ymainloop: + movq (x,i,8), a + sbbq (y,i,8), a + incq i + decq m + jnz bignum_ge_ymainloop +bignum_ge_ytoploop: + movl $0, ashort + sbbq (y,i,8), a + incq i + decq n + jnz bignum_ge_ytoploop + + sbbq a, a + incq a +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits 
+#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_gt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_gt.S new file mode 100644 index 00000000000..e2673ad390e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_gt.S @@ -0,0 +1,111 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Compare bignums, x > y +// Inputs x[m], y[n]; output function return +// +// extern uint64_t bignum_gt +// (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Standard x86-64 ABI: RDI = m, RSI = x, RDX = n, RCX = y, returns RAX +// Microsoft x64 ABI: RCX = m, RDX = x, R8 = n, R9 = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_gt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_gt) + .text + +#define m %rdi +#define x %rsi +#define n %rdx +#define y %rcx +#define i %r8 +#define a %rax + +#define ashort %eax + + + +S2N_BN_SYMBOL(bignum_gt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Zero the main index counter for both branches + + xorq i, i + +// Speculatively form n := n - m and do case split + + subq m, n + jc bignum_gt_ylonger + +// The case where y is longer or of the same size (n >= m) + + incq n + testq m, m + jz bignum_gt_xtest +bignum_gt_xmainloop: + movq (y,i,8), a + sbbq (x,i,8), a + incq i + decq m + jnz bignum_gt_xmainloop + jmp bignum_gt_xtest +bignum_gt_xtoploop: + movq (y,i,8), a + sbbq $0, a + incq i +bignum_gt_xtest: + decq n + jnz bignum_gt_xtoploop + sbbq a, a + negq a +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +// The case where x is longer (m > n) + +bignum_gt_ylonger: + addq m, n + subq n, m + testq n, n + jz bignum_gt_ytoploop +bignum_gt_ymainloop: + movq (y,i,8), a + sbbq (x,i,8), a + incq i + decq n + jnz bignum_gt_ymainloop +bignum_gt_ytoploop: + movl $0, ashort + sbbq (x,i,8), a + incq i + decq m + jnz bignum_gt_ytoploop + + sbbq a, a + negq a +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_iszero.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_iszero.S new file mode 100644 index 00000000000..f33b8fc714e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_iszero.S @@ -0,0 +1,58 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Test bignum for zero-ness, x = 0 +// Input x[k]; output function return +// +// extern uint64_t bignum_iszero (uint64_t k, uint64_t *x); +// +// Standard x86-64 ABI: RDI = k, RSI = x, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = x, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_iszero) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_iszero) + .text + +#define a %rax +#define k %rdi +#define x %rsi + +S2N_BN_SYMBOL(bignum_iszero): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + xorq a, a + testq k, k + jz bignum_iszero_end + +bignum_iszero_loop: + orq -8(x,k,8), a + decq k + jnz bignum_iszero_loop + +// Set a standard C condition based on whether a is nonzero + + negq a + sbbq a, a + +bignum_iszero_end: + incq a +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_le.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_le.S new file mode 100644 index 00000000000..114755d29d1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_le.S @@ -0,0 +1,111 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Compare bignums, x <= y +// Inputs x[m], y[n]; output function return +// +// extern uint64_t bignum_le +// (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Standard x86-64 ABI: RDI = m, RSI = x, RDX = n, RCX = y, returns RAX +// Microsoft x64 ABI: RCX = m, RDX = x, R8 = n, R9 = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_le) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_le) + .text + +#define m %rdi +#define x %rsi +#define n %rdx +#define y %rcx +#define i %r8 +#define a %rax + +#define ashort %eax + + + +S2N_BN_SYMBOL(bignum_le): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Zero the main index counter for both branches + + xorq i, i + +// Speculatively form n := n - m and do case split + + subq m, n + jc bignum_le_ylonger + +// The case where y is longer or of the same size (n >= m) + + incq n + testq m, m + jz bignum_le_xtest +bignum_le_xmainloop: + movq (y,i,8), a + sbbq (x,i,8), a + incq i + decq m + jnz bignum_le_xmainloop + jmp bignum_le_xtest +bignum_le_xtoploop: + movq (y,i,8), a + sbbq $0, a + incq i +bignum_le_xtest: + decq n + jnz bignum_le_xtoploop + sbbq a, a + incq a +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +// The case where x is longer (m > n) + +bignum_le_ylonger: + addq m, n + subq n, m + testq n, n + jz bignum_le_ytoploop +bignum_le_ymainloop: + movq (y,i,8), a + sbbq (x,i,8), a + incq i + decq n + jnz bignum_le_ymainloop +bignum_le_ytoploop: + movl $0, ashort + sbbq (x,i,8), a + incq i + decq m + jnz bignum_le_ytoploop + + sbbq a, a + incq a +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && 
defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_lt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_lt.S new file mode 100644 index 00000000000..95a1cc3c979 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_lt.S @@ -0,0 +1,111 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Compare bignums, x < y +// Inputs x[m], y[n]; output function return +// +// extern uint64_t bignum_lt +// (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Standard x86-64 ABI: RDI = m, RSI = x, RDX = n, RCX = y, returns RAX +// Microsoft x64 ABI: RCX = m, RDX = x, R8 = n, R9 = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_lt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_lt) + .text + +#define m %rdi +#define x %rsi +#define n %rdx +#define y %rcx +#define i %r8 +#define a %rax + +#define ashort %eax + + + +S2N_BN_SYMBOL(bignum_lt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Zero the main index counter for both branches + + xorq i, i + +// Speculatively form m := m - n and do case split + + subq n, m + jc bignum_lt_ylonger + +// The case where x is longer or of the same size (m >= n) + + incq m + testq n, n + jz bignum_lt_xtest +bignum_lt_xmainloop: + movq (x,i,8), a + sbbq (y,i,8), a + incq i + decq n + jnz bignum_lt_xmainloop + jmp bignum_lt_xtest +bignum_lt_xtoploop: + movq (x,i,8), a + sbbq $0, a + incq i +bignum_lt_xtest: + decq m + jnz bignum_lt_xtoploop + sbbq a, a + negq a +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +// The case where y is longer (n > m) + +bignum_lt_ylonger: + addq n, m + subq m, n + testq m, m + jz bignum_lt_ytoploop +bignum_lt_ymainloop: + movq (x,i,8), a + sbbq (y,i,8), a + incq i + decq m + jnz bignum_lt_ymainloop +bignum_lt_ytoploop: + movl $0, ashort + sbbq (y,i,8), a + incq i + decq n + jnz bignum_lt_ytoploop + + sbbq a, a + negq a +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_madd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_madd.S new file mode 100644 index 00000000000..4f5f876424a --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_madd.S @@ -0,0 +1,162 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply-add, z := z + x * y +// Inputs x[m], y[n]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_madd +// (uint64_t k, uint64_t *z, +// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Does the "z := x * y + z" operation, while also returning a "next" or +// "carry" word. In the case where m + n <= p (i.e. the pure product would +// fit in the destination) this is the remainder for the exact result. 
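+//
+// Illustrative (unverified) C-level sketch of the column-wise accumulation
+// used below, for orientation only; "ref_madd" and its 128-bit temporaries
+// are placeholders, not part of this file:
+//
+//   uint64_t ref_madd(uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x,
+//                     uint64_t n, const uint64_t *y) {
+//     uint64_t l = 0, h = 0;                       // low two words of accumulator
+//     for (uint64_t i = 0; i < k; i++) {
+//       uint64_t c = 0;                            // third accumulator word
+//       uint64_t jlo = (i + 1 > n) ? i + 1 - n : 0;
+//       uint64_t jhi = (i + 1 < m) ? i + 1 : m;
+//       for (uint64_t j = jlo; j < jhi; j++) {     // all products landing in column i
+//         unsigned __int128 t = (unsigned __int128)x[j] * y[i - j];
+//         unsigned __int128 s = (unsigned __int128)l + (uint64_t)t;
+//         l = (uint64_t)s;
+//         s = (unsigned __int128)h + (uint64_t)(t >> 64) + (uint64_t)(s >> 64);
+//         h = (uint64_t)s;
+//         c += (uint64_t)(s >> 64);
+//       }
+//       unsigned __int128 s = (unsigned __int128)z[i] + l;  // add existing digit
+//       z[i] = (uint64_t)s;
+//       s = (unsigned __int128)h + (uint64_t)(s >> 64);
+//       l = (uint64_t)s;                           // shift accumulator down a word
+//       h = c + (uint64_t)(s >> 64);
+//     }
+//     return l;                                    // the "next"/carry word
+//   }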
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_madd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_madd) + .text + +// These are actually right + +#define p %rdi +#define z %rsi +#define n %r8 + +// These are not + +#define c %r15 +#define h %r14 +#define l %r13 +#define x %r12 +#define y %r11 +#define i %rbx +#define k %r10 +#define m %rbp + +// These are always local scratch since multiplier result is in these + +#define a %rax +#define d %rdx + + + +S2N_BN_SYMBOL(bignum_madd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 + movq 64(%rsp), %r9 +#endif + +// We use too many registers, and also we need %rax:%rdx for multiplications + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rdx, m + +// If the result size is zero, just return %rax = 0 +// We could also do this if either input is size 0. + + xorq %rax, %rax + testq p, p + jz bignum_madd_end + +// Set initial 2-part sum to zero (we zero c inside the body) + + xorq h, h + xorq l, l + +// Otherwise do outer loop k = 0 ... k = p - 1 + + xorq k, k + +bignum_madd_outerloop: + +// Zero our carry term first; we eventually want it and a zero is useful now +// Set a = max 0 (k + 1 - n), i = min (k + 1) m +// This defines the range a <= j < i for the inner summation +// Note that since k < p < 2^64 we can assume k + 1 doesn't overflow +// And since we want to increment it anyway, we might as well do it now + + xorq c, c // c = 0 + incq k // k = k + 1 + + movq k, a // a = k + 1 + subq n, a // a = k + 1 - n + cmovcq c, a // a = max 0 (k + 1 - n) + + movq m, i // i = m + cmpq m, k // CF <=> k + 1 < m + cmovcq k, i // i = min (k + 1) m + +// Turn i into a loop count, and skip things if it's <= 0 +// Otherwise set up initial pointers x -> x0[a] and y -> y0[k - a] +// and then launch into the main inner loop, postdecrementing i + + movq k, d + subq i, d + subq a, i + jbe bignum_madd_innerend + leaq (%rcx,a,8), x + leaq -8(%r9,d,8), y + +bignum_madd_innerloop: + movq (y,i,8), %rax + mulq (x) + addq $8, x + addq %rax, l + adcq %rdx, h + adcq $0, c + decq i + jnz bignum_madd_innerloop + +bignum_madd_innerend: + + addq l, (z) + adcq $0, h + adcq $0, c + movq h, l + movq c, h + addq $8, z + + cmpq p, k + jc bignum_madd_outerloop + +// Move the carry term into the return value + + movq l, %rax + +bignum_madd_end: + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modadd.S new file mode 100644 index 00000000000..351ed07515e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modadd.S @@ -0,0 +1,99 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Add modulo m, z := (x + y) mod m, assuming x and y reduced +// Inputs x[k], y[k], m[k]; output z[k] +// +// extern void bignum_modadd +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = y, R8 = m +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = y, [RSP+40] = m +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modadd) + .text + +#define k %rdi +#define z %rsi +#define x %rdx +#define y %rcx +#define m %r8 +#define i %r9 +#define j %r10 +#define a %rax +#define c %r11 + +S2N_BN_SYMBOL(bignum_modadd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// If k = 0 do nothing + + testq k, k + jz bignum_modadd_end + +// First just add (c::z) := x + y + + xorq c, c + movq k, j + xorq i, i +bignum_modadd_addloop: + movq (x,i,8), a + adcq (y,i,8), a + movq a, (z,i,8) + incq i + decq j + jnz bignum_modadd_addloop + adcq $0, c + +// Now do a comparison subtraction (c::z) - m, recording mask for (c::z) >= m + + movq k, j + xorq i, i +bignum_modadd_cmploop: + movq (z,i,8), a + sbbq (m,i,8), a + incq i + decq j + jnz bignum_modadd_cmploop + sbbq $0, c + notq c + +// Now do a masked subtraction z := z - [c] * m + + xorq i, i +bignum_modadd_subloop: + movq (m,i,8), a + andq c, a + negq j + sbbq a, (z,i,8) + sbbq j, j + incq i + cmpq k, i + jc bignum_modadd_subloop + +bignum_modadd_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_moddouble.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_moddouble.S new file mode 100644 index 00000000000..e684d51b2ab --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_moddouble.S @@ -0,0 +1,93 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Double modulo m, z := (2 * x) mod m, assuming x reduced +// Inputs x[k], m[k]; output z[k] +// +// extern void bignum_moddouble +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = m +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = m +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_moddouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_moddouble) + .text + +#define k %rdi +#define z %rsi +#define x %rdx +#define m %rcx +#define i %r8 +#define a %r9 +#define c %rax +#define b %r10 + +S2N_BN_SYMBOL(bignum_moddouble): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// If k = 0 do nothing + + testq k, k + jz bignum_moddouble_end + +// Do (_::z) = 2 * x - m and generate a mask in c for 2 * x < m + + xorq c, c + xorq i, i + xorq b, b + +bignum_moddouble_dubloop: + movq (x,i,8), a + shrdq $63, a, c + negq b + sbbq (m,i,8), c + sbbq b, b + movq c, (z,i,8) + movq a, c + incq i + cmpq k, i + jc bignum_moddouble_dubloop + shrq $63, c + + addq b, c + +// Now do a corrective masked addition z := z + [c] * m + + xorq i, i + xorq b, b +bignum_moddouble_corrloop: + movq (m,i,8), a + andq c, a + negq b + adcq (z,i,8), a + sbbq b, b + movq a, (z,i,8) + incq i + cmpq k, i + jc bignum_moddouble_corrloop + +bignum_moddouble_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modexp.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modexp.S new file mode 100644 index 00000000000..d5dc9b70a3c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modexp.S @@ -0,0 +1,671 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Modular exponentiation for arbitrary odd modulus +// Inputs a[k], p[k], m[k]; output z[k], temporary buffer t[>=3*k] +// +// extern void bignum_modexp +// (uint64_t k,uint64_t *z, uint64_t *a,uint64_t *p,uint64_t *m,uint64_t *t); +// +// Does z := (a^p) mod m where all numbers are k-digit and m is odd +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = a, RCX = p, R8 = m, R9 = t +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = a, R9 = p, [RSP+40] = m, [RSP+48] = t +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modexp) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modexp) + .text + +// Local variables, all kept on the stack + +#define k (%rsp) +#define res 8(%rsp) +#define a 16(%rsp) +#define p 24(%rsp) +#define m 32(%rsp) +#define x 40(%rsp) +#define i 48(%rsp) +#define y 56(%rsp) +#define z 64(%rsp) + +#define VARSIZE 72 + +S2N_BN_SYMBOL(bignum_modexp): + _CET_ENDBR + +// The Windows version literally calls the standard ABI version. +// This simplifies the proofs since subroutine offsets are fixed. 
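+//
+// Illustrative (unverified) sketch of the control flow implemented below, in
+// rough C; the helper names shadow the local subroutines further down and the
+// argument lists mirror the registers set up before each call:
+//
+//   amontifier(k, z, m, y);                  // z = 2^{128k} mod m
+//   amontmul(k, x, z, a, m);                 // x = a * 2^{64k} mod m (Montgomery form)
+//   demont(k, z, z, m);                      // z = 2^{64k} mod m, i.e. "1" in Montgomery form
+//   for (uint64_t i = 64 * k; i-- > 0; ) {   // left-to-right square-and-multiply
+//     amontmul(k, y, z, z, m);               // y = z^2
+//     amontmul(k, z, x, y, m);               // z = y * x
+//     uint64_t bit = (p[i >> 6] >> (i & 63)) & 1;
+//     mux(bit, k, z, z, y);                  // keep the multiply only if bit is set
+//   }
+//   demont(k, z, z, m);                      // strip the Montgomery factor
+//   mux(0, k, res, z, z);                    // copy z into the output buffer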
+ +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 + movq 64(%rsp), %r9 + callq bignum_modexp_standard + popq %rsi + popq %rdi + ret + +bignum_modexp_standard: +#endif + +// Real start of the standard ABI code. +// Bump down the stack to make room for local variables + + subq $VARSIZE, %rsp + +// If size is zero (which falsifies the oddness condition) do nothing + + testq %rdi, %rdi + jz bignum_modexp_end + +// Set up local variables based on input parameters + + movq %rdi, k + movq %rsi, res + movq %rdx, a + movq %rcx, p + movq %r8, m + movq %r9, x + leaq (%r9,%rdi,8), %rax + movq %rax, y + leaq (%rax,%rdi,8), %rax + movq %rax, z + +// Let x == 2^64k * a (mod m) and initialize z == 2^64k * 1 (mod m) + + movq k, %rdi + movq z, %rsi + movq m, %rdx + movq y, %rcx + callq bignum_modexp_local_amontifier + + movq k, %rdi + movq x, %rsi + movq z, %rdx + movq a, %rcx + movq m, %r8 + callq bignum_modexp_local_amontmul + + movq k, %rdi + movq z, %rsi + movq z, %rdx + movq m, %rcx + callq bignum_modexp_local_demont + +// Main loop with z == 2^64k * a^(p >> 2^i) (mod m) + + movq k, %rax + shlq $6, %rax + movq %rax, i + +bignum_modexp_loop: + subq $1, %rax + movq %rax, i + + movq k, %rdi + movq y, %rsi + movq z, %rdx + movq z, %rcx + movq m, %r8 + callq bignum_modexp_local_amontmul + + movq k, %rdi + movq z, %rsi + movq x, %rdx + movq y, %rcx + movq m, %r8 + callq bignum_modexp_local_amontmul + + movq i, %rdx + movq %rdx, %rcx + shrq $6, %rdx + movq p, %rsi + movq (%rsi,%rdx,8), %rdi + shrq %cl, %rdi + andq $1, %rdi + + movq k, %rsi + movq z, %rdx + movq z, %rcx + movq y, %r8 + callq bignum_modexp_local_mux + + movq i, %rax + testq %rax, %rax + jnz bignum_modexp_loop + +// Convert back from Montgomery representation and copy the result +// (via a degenerate case of multiplexing) into the output buffer + + movq k, %rdi + movq z, %rsi + movq z, %rdx + movq m, %rcx + callq bignum_modexp_local_demont + + xorl %edi, %edi + movq k, %rsi + movq res, %rdx + movq z, %rcx + movq z, %r8 + callq bignum_modexp_local_mux + +// Restore the stack pointer and return + +bignum_modexp_end: + addq $VARSIZE, %rsp + ret + +// Local copy of bignum_amontifier + +bignum_modexp_local_amontifier: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + movq %rdx, %r12 + movq %rcx, %r13 + testq %rdi, %rdi + je bignum_modexp_amontifier_end + xorq %rbx, %rbx +bignum_modexp_copyinloop: + movq (%r12,%rbx,8), %rcx + movq %rcx, (%r13,%rbx,8) + incq %rbx + cmpq %rdi, %rbx + jb bignum_modexp_copyinloop + movq %rdi, %rbx + decq %rbx + je bignum_modexp_normalized +bignum_modexp_normloop: + xorq %rbp, %rbp + movq %rdi, %r11 + negq %rcx + movl $0x0, %eax +bignum_modexp_shufloop: + movq %rax, %rcx + movq (%r13,%rbp,8), %rax + cmovbq %rax, %rcx + movq %rcx, (%r13,%rbp,8) + incq %rbp + decq %r11 + jne bignum_modexp_shufloop + decq %rbx + jne bignum_modexp_normloop +bignum_modexp_normalized: + bsrq %rcx, %rcx + xorq $0x3f, %rcx + xorq %r9, %r9 + xorq %rbx, %rbx +bignum_modexp_bitloop: + movq (%r13,%rbx,8), %rax + movq %rax, %rbp + shldq %cl, %r9, %rax + movq %rax, (%r13,%rbx,8) + movq %rbp, %r9 + incq %rbx + cmpq %rdi, %rbx + jb bignum_modexp_bitloop + movq -0x8(%r13,%rdi,8), %r11 + movl $0x1, %r8d + movq %r11, %r9 + negq %r9 + movl $0x3e, %ebx +bignum_modexp_estloop: + addq %r8, %r8 + movq %r11, %rax + subq %r9, %rax + cmpq %rax, %r9 + sbbq %rax, %rax + notq %rax + subq %rax, %r8 + addq %r9, %r9 + andq %r11, %rax + subq %rax, %r9 + decq %rbx + jne 
bignum_modexp_estloop + incq %r9 + cmpq %r9, %r11 + adcq $0x0, %r8 + xorq %rcx, %rcx + xorq %rbx, %rbx +bignum_modexp_mulloop: + movq (%r13,%rbx,8), %rax + mulq %r8 + addq %rcx, %rax + adcq $0x0, %rdx + movq %rax, (%rsi,%rbx,8) + movq %rdx, %rcx + incq %rbx + cmpq %rdi, %rbx + jb bignum_modexp_mulloop + movabs $0x4000000000000000, %rax + subq %rax, %rcx + sbbq %r8, %r8 + notq %r8 + xorq %rcx, %rcx + xorq %rbx, %rbx +bignum_modexp_remloop: + movq (%r13,%rbx,8), %rax + andq %r8, %rax + negq %rcx + sbbq (%rsi,%rbx,8), %rax + sbbq %rcx, %rcx + movq %rax, (%rsi,%rbx,8) + incq %rbx + cmpq %rdi, %rbx + jb bignum_modexp_remloop + xorq %rcx, %rcx + xorq %rbp, %rbp + xorq %r9, %r9 +bignum_modexp_dubloop1: + movq (%rsi,%rbp,8), %rax + shrdq $0x3f, %rax, %rcx + negq %r9 + sbbq (%r13,%rbp,8), %rcx + sbbq %r9, %r9 + movq %rcx, (%rsi,%rbp,8) + movq %rax, %rcx + incq %rbp + cmpq %rdi, %rbp + jb bignum_modexp_dubloop1 + shrq $0x3f, %rcx + addq %r9, %rcx + xorq %rbp, %rbp + xorq %r9, %r9 +bignum_modexp_corrloop1: + movq (%r13,%rbp,8), %rax + andq %rcx, %rax + negq %r9 + adcq (%rsi,%rbp,8), %rax + sbbq %r9, %r9 + movq %rax, (%rsi,%rbp,8) + incq %rbp + cmpq %rdi, %rbp + jb bignum_modexp_corrloop1 + xorq %rcx, %rcx + xorq %rbp, %rbp + xorq %r9, %r9 +bignum_modexp_dubloop2: + movq (%rsi,%rbp,8), %rax + shrdq $0x3f, %rax, %rcx + negq %r9 + sbbq (%r13,%rbp,8), %rcx + sbbq %r9, %r9 + movq %rcx, (%rsi,%rbp,8) + movq %rax, %rcx + incq %rbp + cmpq %rdi, %rbp + jb bignum_modexp_dubloop2 + shrq $0x3f, %rcx + addq %r9, %rcx + xorq %rbp, %rbp + xorq %r9, %r9 +bignum_modexp_corrloop2: + movq (%r13,%rbp,8), %rax + andq %rcx, %rax + negq %r9 + adcq (%rsi,%rbp,8), %rax + sbbq %r9, %r9 + movq %rax, (%rsi,%rbp,8) + movq %rax, (%r13,%rbp,8) + incq %rbp + cmpq %rdi, %rbp + jb bignum_modexp_corrloop2 + xorq %r11, %r11 + movq %rdi, %rbx +bignum_modexp_modloop: + xorq %r9, %r9 + movq %rdi, %r8 + xorq %rbp, %rbp + xorq %rcx, %rcx +bignum_modexp_cmaloop: + adcq %r9, %rcx + sbbq %r10, %r10 + movq (%rsi,%rbp,8), %rax + mulq %r11 + subq %r10, %rdx + addq %rcx, %rax + movq (%r13,%rbp,8), %r9 + movq %rax, (%r13,%rbp,8) + movq %rdx, %rcx + incq %rbp + decq %r8 + jne bignum_modexp_cmaloop + adcq %rcx, %r9 + movq %r9, %r11 + sbbq %r10, %r10 + xorq %rbp, %rbp + xorq %rcx, %rcx +bignum_modexp_oaloop: + movq (%r13,%rbp,8), %rax + movq (%rsi,%rbp,8), %r9 + andq %r10, %r9 + negq %rcx + adcq %r9, %rax + sbbq %rcx, %rcx + movq %rax, (%r13,%rbp,8) + incq %rbp + cmpq %rdi, %rbp + jb bignum_modexp_oaloop + subq %rcx, %r11 + decq %rbx + jne bignum_modexp_modloop + movq (%r12), %rax + movq %rax, %rcx + movq %rax, %r9 + shlq $0x2, %rcx + subq %rcx, %r9 + xorq $0x2, %r9 + movq %r9, %rcx + imulq %rax, %rcx + movl $0x2, %eax + addq %rcx, %rax + addq $0x1, %rcx + imulq %rax, %r9 + imulq %rcx, %rcx + movl $0x1, %eax + addq %rcx, %rax + imulq %rax, %r9 + imulq %rcx, %rcx + movl $0x1, %eax + addq %rcx, %rax + imulq %rax, %r9 + imulq %rcx, %rcx + movl $0x1, %eax + addq %rcx, %rax + imulq %rax, %r9 + movq (%r13), %rcx + imulq %rcx, %r9 + movq (%r12), %rax + mulq %r9 + addq %rcx, %rax + movq %rdx, %rcx + movl $0x1, %ebp + movq %rdi, %r8 + decq %r8 + je bignum_modexp_montifend +bignum_modexp_montifloop: + adcq (%r13,%rbp,8), %rcx + sbbq %r10, %r10 + movq (%r12,%rbp,8), %rax + mulq %r9 + subq %r10, %rdx + addq %rcx, %rax + movq %rax, -0x8(%r13,%rbp,8) + movq %rdx, %rcx + incq %rbp + decq %r8 + jne bignum_modexp_montifloop +bignum_modexp_montifend: + adcq %rcx, %r11 + sbbq %r10, %r10 + movq %r11, -0x8(%r13,%rdi,8) + xorq %rbp, %rbp + xorq %rcx, %rcx 
+bignum_modexp_osloop: + movq (%r13,%rbp,8), %rax + movq (%r12,%rbp,8), %r9 + andq %r10, %r9 + negq %rcx + sbbq %r9, %rax + sbbq %rcx, %rcx + movq %rax, (%rsi,%rbp,8) + incq %rbp + cmpq %rdi, %rbp + jb bignum_modexp_osloop +bignum_modexp_amontifier_end: + popq %r13 + popq %r12 + popq %rbx + popq %rbp + ret + +// Local copy of bignum_amontmul + +bignum_modexp_local_amontmul: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0x8, %rsp + testq %rdi, %rdi + je bignum_modexp_amont_end + movq %rdx, %r9 + movq (%r8), %rax + movq %rax, %rdx + movq %rax, %rbx + shlq $0x2, %rdx + subq %rdx, %rbx + xorq $0x2, %rbx + movq %rbx, %rdx + imulq %rax, %rdx + movl $0x2, %eax + addq %rdx, %rax + addq $0x1, %rdx + imulq %rax, %rbx + imulq %rdx, %rdx + movl $0x1, %eax + addq %rdx, %rax + imulq %rax, %rbx + imulq %rdx, %rdx + movl $0x1, %eax + addq %rdx, %rax + imulq %rax, %rbx + imulq %rdx, %rdx + movl $0x1, %eax + addq %rdx, %rax + imulq %rax, %rbx + movq %rbx, (%rsp) + xorq %r13, %r13 + xorq %rbx, %rbx +bignum_modexp_zoop: + movq %r13, (%rsi,%rbx,8) + incq %rbx + cmpq %rdi, %rbx + jb bignum_modexp_zoop + xorq %r14, %r14 +bignum_modexp_outeramontloop: + movq (%r9,%r13,8), %rbp + xorq %rbx, %rbx + xorq %r10, %r10 + xorq %r15, %r15 + movq %rdi, %r12 +bignum_modexp_maddloop: + adcq (%rsi,%rbx,8), %r10 + sbbq %r11, %r11 + movq (%rcx,%rbx,8), %rax + mulq %rbp + subq %r11, %rdx + addq %r10, %rax + movq %rax, (%rsi,%rbx,8) + movq %rdx, %r10 + incq %rbx + decq %r12 + jne bignum_modexp_maddloop + adcq %r10, %r14 + adcq %r15, %r15 + movq (%rsi), %r11 + movq (%rsp), %rbp + imulq %r11, %rbp + movq (%r8), %rax + mulq %rbp + addq %r11, %rax + movq %rdx, %r10 + movl $0x1, %ebx + movq %rdi, %r12 + decq %r12 + je bignum_modexp_montend +bignum_modexp_montloop: + adcq (%rsi,%rbx,8), %r10 + sbbq %r11, %r11 + movq (%r8,%rbx,8), %rax + mulq %rbp + subq %r11, %rdx + addq %r10, %rax + movq %rax, -0x8(%rsi,%rbx,8) + movq %rdx, %r10 + incq %rbx + decq %r12 + jne bignum_modexp_montloop +bignum_modexp_montend: + adcq %r14, %r10 + adcq $0x0, %r15 + movq %r15, %r14 + movq %r10, -0x8(%rsi,%rbx,8) + incq %r13 + cmpq %rdi, %r13 + jb bignum_modexp_outeramontloop + xorq %rbp, %rbp + subq %r14, %rbp + xorq %r11, %r11 + xorq %rbx, %rbx +bignum_modexp_acorrloop: + movq (%r8,%rbx,8), %rax + andq %rbp, %rax + negq %r11 + sbbq %rax, (%rsi,%rbx,8) + sbbq %r11, %r11 + incq %rbx + cmpq %rdi, %rbx + jb bignum_modexp_acorrloop +bignum_modexp_amont_end: + addq $0x8, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +// Local copy of bignum_demont + +bignum_modexp_local_demont: + pushq %rbx + pushq %rbp + pushq %r12 + testq %rdi, %rdi + je bignum_modexp_demont_end + movq (%rcx), %rax + movq %rax, %rbx + movq %rax, %r8 + shlq $0x2, %rbx + subq %rbx, %r8 + xorq $0x2, %r8 + movq %r8, %rbx + imulq %rax, %rbx + movl $0x2, %eax + addq %rbx, %rax + addq $0x1, %rbx + imulq %rax, %r8 + imulq %rbx, %rbx + movl $0x1, %eax + addq %rbx, %rax + imulq %rax, %r8 + imulq %rbx, %rbx + movl $0x1, %eax + addq %rbx, %rax + imulq %rax, %r8 + imulq %rbx, %rbx + movl $0x1, %eax + addq %rbx, %rax + imulq %rax, %r8 + xorq %rbx, %rbx +bignum_modexp_iloop: + movq (%rdx,%rbx,8), %rax + movq %rax, (%rsi,%rbx,8) + incq %rbx + cmpq %rdi, %rbx + jb bignum_modexp_iloop + xorq %r9, %r9 +bignum_modexp_outerdemontloop: + movq (%rsi), %r11 + movq %r8, %rbp + imulq %r11, %rbp + movq (%rcx), %rax + mulq %rbp + addq %r11, %rax + movq %rdx, %r10 + movl $0x1, %ebx + movq %rdi, %r12 + decq %r12 + je bignum_modexp_demontend 
+bignum_modexp_demontloop: + adcq (%rsi,%rbx,8), %r10 + sbbq %r11, %r11 + movq (%rcx,%rbx,8), %rax + mulq %rbp + subq %r11, %rdx + addq %r10, %rax + movq %rax, -0x8(%rsi,%rbx,8) + movq %rdx, %r10 + incq %rbx + decq %r12 + jne bignum_modexp_demontloop +bignum_modexp_demontend: + adcq $0x0, %r10 + movq %r10, -0x8(%rsi,%rbx,8) + incq %r9 + cmpq %rdi, %r9 + jb bignum_modexp_outerdemontloop + xorq %rbx, %rbx + movq %rdi, %r12 +bignum_modexp_cmploop: + movq (%rsi,%rbx,8), %rax + sbbq (%rcx,%rbx,8), %rax + incq %rbx + decq %r12 + jne bignum_modexp_cmploop + sbbq %rbp, %rbp + notq %rbp + xorq %r11, %r11 + xorq %rbx, %rbx +bignum_modexp_dcorrloop: + movq (%rcx,%rbx,8), %rax + andq %rbp, %rax + negq %r11 + sbbq %rax, (%rsi,%rbx,8) + sbbq %r11, %r11 + incq %rbx + cmpq %rdi, %rbx + jb bignum_modexp_dcorrloop +bignum_modexp_demont_end: + popq %r12 + popq %rbp + popq %rbx + ret + +// Local copy of bignum_mux + +bignum_modexp_local_mux: + testq %rsi, %rsi + je bignum_modexp_muxend + xorq %r9, %r9 + negq %rdi +bignum_modexp_muxloop: + movq (%rcx,%r9,8), %rax + movq (%r8,%r9,8), %rdi + cmovae %rdi, %rax + movq %rax, (%rdx,%r9,8) + incq %r9 + decq %rsi + jne bignum_modexp_muxloop +bignum_modexp_muxend: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modifier.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modifier.S new file mode 100644 index 00000000000..35e01f3a5ba --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modifier.S @@ -0,0 +1,541 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Compute "modification" constant z := 2^{64k} mod m +// Input m[k]; output z[k]; temporary buffer t[>=k] +// +// extern void bignum_modifier +// (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t); +// +// The last argument points to a temporary buffer t that should have size >= k. +// This is called "mod-ifier" because given any other k-digit number x we can +// get x MOD m simply and reasonably efficiently just by Montgomery +// multiplication of x and z. But one can also consider it the identity for +// Montgomery multiplication, assuming you have a reduced multiplier already. 
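+//
+// Illustrative usage sketch (not part of this file): with z = 2^{64k} mod m,
+// a single Montgomery multiplication reduces any k-digit x, since
+//
+//     montmul(x, z) == x * z * 2^{-64k} == x * 2^{64k} * 2^{-64k} == x (mod m)
+//
+// so, assuming the companion bignum_amontmul routine with argument order
+// (k, dest, x, y, m), something like:
+//
+//   bignum_modifier(k, z, m, t);        // z := 2^{64k} mod m
+//   bignum_amontmul(k, r, x, z, m);     // r == x (mod m), almost-reduced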
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = m, RCX = t +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = m, R9 = t +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modifier) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modifier) + .text + +#define k %rdi +#define z %rsi + +// These two inputs get moved to different places since RCX and RDX are special + +#define m %r12 +#define t %r13 + +// Other variables + +#define i %rbx +// Modular inverse; aliased to i, but we never use them together +#define w %rbx +#define j %rbp +// Matters that this is RAX for special use in multiplies +#define a %rax +// Matters that this is RDX for special use in multiplies +#define d %rdx +// Matters that this is RCX as CL=lo(c) is assumed in shifts +#define c %rcx +#define h %r11 +#define l %r10 +#define b %r9 +#define n %r8 + +// Some aliases for the values b and n + +#define q %r8 +#define r %r9 + +#define ashort %eax +#define ishort %ebx +#define jshort %ebp +#define qshort %r8d + + +S2N_BN_SYMBOL(bignum_modifier): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Save some additional registers for use, copy args out of RCX and RDX + + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + + movq %rdx, m + movq %rcx, t + +// If k = 0 the whole operation is trivial + + testq k, k + jz bignum_modifier_end + +// Copy the input m into the temporary buffer t. The temporary register +// c matters since we want it to hold the highest digit, ready for the +// normalization phase. + + xorq i, i +bignum_modifier_copyinloop: + movq (m,i,8), c + movq c, (t,i,8) + incq i + cmpq k, i + jc bignum_modifier_copyinloop + +// Do a rather stupid but constant-time digit normalization, conditionally +// shifting left (k-1) times based on whether the top word is zero. +// With careful binary striding this could be O(k*log(k)) instead of O(k^2) +// while still retaining the constant-time style. +// The "neg c" sets the zeroness predicate (~CF) for the entire inner loop + + movq k, i + decq i + jz bignum_modifier_normalized +bignum_modifier_normloop: + xorq j, j + movq k, h + negq c + movl $0, ashort +bignum_modifier_shufloop: + movq a, c + movq (t,j,8), a + cmovcq a, c + movq c, (t,j,8) + incq j + decq h + jnz bignum_modifier_shufloop + decq i + jnz bignum_modifier_normloop + +// We now have the top digit nonzero, assuming the input was nonzero, +// and as per the invariant of the loop above, c holds that digit. So +// now just count c's leading zeros and shift t bitwise that many bits. +// Note that we don't care about the result of bsr for zero inputs so +// the simple xor-ing with 63 is safe. + +bignum_modifier_normalized: + + bsrq c, c + xorq $63, c + + xorq b, b + xorq i, i +bignum_modifier_bitloop: + movq (t,i,8), a + movq a, j + shldq %cl, b, a + movq a, (t,i,8) + movq j, b + incq i + cmpq k, i + jc bignum_modifier_bitloop + +// Let h be the high word of n, which in all the in-scope cases is >= 2^63. +// Now successively form q = 2^i div h and r = 2^i mod h as i goes from +// 64 to 126. We avoid just using division out of constant-time concerns +// (at the least we would need to fix up h = 0 for out-of-scope inputs) and +// don't bother with Newton-Raphson, since this stupid simple loop doesn't +// contribute much of the overall runtime at typical sizes. 
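+//
+// Illustrative (unverified) C rendering of this estimation loop, with the
+// branches written out for clarity (the code below is branch-free):
+//
+//   uint64_t q = 1, r = 0 - h;              // 2^64 = 1 * h + (2^64 - h), as h >= 2^63
+//   for (int i = 65; i <= 126; i++) {       // 62 steps, one quotient bit per step
+//     q = 2 * q;
+//     if (r >= h - r) { q += 1; r = r - (h - r); }   // 2*r >= h: subtract h
+//     else            { r = 2 * r; }                 // 2*r <  h: just double
+//   }
+//   q += (r >= h);                          // fixup for the r = h = 2^63 corner case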
+ + movq -8(t,k,8), h + movl $1, qshort + movq h, r + negq r + movl $62, ishort +bignum_modifier_estloop: + + addq q, q + movq h, a + subq r, a + cmpq a, r // CF <=> r < h - r <=> 2 * r < h + sbbq a, a + notq a // a = bitmask(2 * r >= h) + subq a, q + addq r, r + andq h, a + subq a, r + decq i + jnz bignum_modifier_estloop + +// Strictly speaking the above loop doesn't quite give the true remainder +// and quotient in the special case r = h = 2^63, so fix it up. We get +// q = 2^63 - 1 and r = 2^63 and really want q = 2^63 and r = 0. This is +// supererogatory, because the main property of q used below still holds +// in this case unless the initial m = 1, and then anyway the overall +// specification (congruence modulo m) holds degenerately. But it seems +// nicer to get a "true" quotient and remainder. + + incq r + cmpq r, h + adcq $0, q + +// So now we have q and r with 2^126 = q * h + r (imagining r = 0 in the +// fixed-up case above: note that we never actually use the computed +// value of r below and so didn't adjust it). And we can assume the ranges +// q <= 2^63 and r < h < 2^64. +// +// The idea is to use q as a first quotient estimate for a remainder +// of 2^{p+62} mod n, where p = 64 * k. We have, splitting n into the +// high and low parts h and l: +// +// 2^{p+62} - q * n = 2^{p+62} - q * (2^{p-64} * h + l) +// = 2^{p+62} - (2^{p-64} * (q * h) + q * l) +// = 2^{p+62} - 2^{p-64} * (2^126 - r) - q * l +// = 2^{p-64} * r - q * l +// +// Note that 2^{p-64} * r < 2^{p-64} * h <= n +// and also q * l < 2^63 * 2^{p-64} = 2^{p-1} <= n +// so |diff| = |2^{p-64} * r - q * l| < n. +// +// If in fact diff >= 0 then it is already 2^{p+62} mod n. +// otherwise diff + n is the right answer. +// +// To (maybe?) make the computation slightly easier we actually flip +// the sign and compute d = q * n - 2^{p+62}. Then the answer is either +// -d (when negative) or n - d; in either case we effectively negate d. +// This negating tweak in fact spoils the result for cases where +// 2^{p+62} mod n = 0, when we get n instead. However the only case +// where this can happen is m = 1, when the whole spec holds trivially, +// and actually the remainder of the logic below works anyway since +// the latter part of the code only needs a congruence for the k-digit +// result, not strict modular reduction (the doublings will maintain +// the non-strict inequality). + + xorq c, c + xorq i, i +bignum_modifier_mulloop: + movq (t,i,8), a + mulq q + addq c, a + adcq $0, d + movq a, (z,i,8) + movq d, c + incq i + cmpq k, i + jc bignum_modifier_mulloop + +// Now c is the high word of the product, so subtract 2^62 +// and then turn it into a bitmask in q = h + + movq $0x4000000000000000, a + subq a, c + sbbq q, q + notq q + +// Now do [c] * n - d for our final answer + + xorq c, c + xorq i, i +bignum_modifier_remloop: + movq (t,i,8), a + andq q, a + negq c + sbbq (z,i,8), a + sbbq c, c + movq a, (z,i,8) + incq i + cmpq k, i + jc bignum_modifier_remloop + +// Now still need to do a couple of modular doublings to get us all the +// way up to 2^{p+64} == r from initial 2^{p+62} == r (mod n). 
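+//
+// Illustrative (unverified) C rendering of one such doubling pass, with the
+// final correction written as a branch for clarity (the code uses a mask, and
+// the modulus n here is the normalized copy held in the buffer t):
+//
+//   uint64_t hi = 0, borrow = 0;
+//   for (uint64_t j = 0; j < k; j++) {           // z := 2*z - n, word by word
+//     uint64_t d = (z[j] << 1) | hi;
+//     hi = z[j] >> 63;                           // bit shifted out of this word
+//     unsigned __int128 s = (unsigned __int128)d - n[j] - borrow;
+//     z[j] = (uint64_t)s;
+//     borrow = (uint64_t)(s >> 64) & 1;
+//   }
+//   if (borrow > hi) {                           // 2*z < n: add n back
+//     uint64_t carry = 0;
+//     for (uint64_t j = 0; j < k; j++) {
+//       unsigned __int128 s = (unsigned __int128)z[j] + n[j] + carry;
+//       z[j] = (uint64_t)s;
+//       carry = (uint64_t)(s >> 64);
+//     }
+//   }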
+ + xorq c, c + xorq j, j + xorq b, b +bignum_modifier_dubloop1: + movq (z,j,8), a + shrdq $63, a, c + negq b + sbbq (t,j,8), c + sbbq b, b + movq c, (z,j,8) + movq a, c + incq j + cmpq k, j + jc bignum_modifier_dubloop1 + shrq $63, c + addq b, c + xorq j, j + xorq b, b +bignum_modifier_corrloop1: + movq (t,j,8), a + andq c, a + negq b + adcq (z,j,8), a + sbbq b, b + movq a, (z,j,8) + incq j + cmpq k, j + jc bignum_modifier_corrloop1 + +// This is not exactly the same: we also copy output to t giving the +// initialization t_1 = r == 2^{p+64} mod n for the main loop next. + + xorq c, c + xorq j, j + xorq b, b +bignum_modifier_dubloop2: + movq (z,j,8), a + shrdq $63, a, c + negq b + sbbq (t,j,8), c + sbbq b, b + movq c, (z,j,8) + movq a, c + incq j + cmpq k, j + jc bignum_modifier_dubloop2 + shrq $63, c + addq b, c + xorq j, j + xorq b, b +bignum_modifier_corrloop2: + movq (t,j,8), a + andq c, a + negq b + adcq (z,j,8), a + sbbq b, b + movq a, (z,j,8) + movq a, (t,j,8) + incq j + cmpq k, j + jc bignum_modifier_corrloop2 + +// We then successively generate (k+1)-digit values satisfying +// t_i == 2^{p+64*i} mod n, each of which is stored in h::t. Finish +// initialization by zeroing h initially + + xorq h, h + +// Then if t_i = 2^{p} * h + l +// we have t_{i+1} == 2^64 * t_i +// = (2^{p+64} * h) + (2^64 * l) +// == r * h + l<<64 +// Do this k more times so we end up == 2^{128*k+64}, one more than we want +// +// Writing B = 2^{64k}, the possible correction of adding r, which for +// a (k+1)-digit result is equivalent to subtracting q = 2^{64*(k+1)} - r +// would give the overall worst-case value minus q of +// [ B * (B^k - 1) + (B - 1) * r ] - [B^{k+1} - r] +// = B * (r - 1) < B^{k+1} so we keep inside k+1 digits as required. +// +// This implementation makes the shift implicit by starting b with the +// "previous" digit (initially 0) to offset things by 1. + + movq k, i +bignum_modifier_modloop: + xorq b, b + movq k, n + xorq j, j + xorq c, c +bignum_modifier_cmaloop: + adcq b, c + sbbq l, l + movq (z,j,8), a + mulq h + subq l, d + addq c, a + movq (t,j,8), b + movq a, (t,j,8) + movq d, c + incq j + decq n + jnz bignum_modifier_cmaloop + adcq c, b + movq b, h + + sbbq l, l + + xorq j, j + xorq c, c +bignum_modifier_oaloop: + movq (t,j,8), a + movq (z,j,8), b + andq l, b + negq c + adcq b, a + sbbq c, c + movq a, (t,j,8) + incq j + cmpq k, j + jc bignum_modifier_oaloop + subq c, h + + decq i + jnz bignum_modifier_modloop + +// Compute the negated modular inverse w (same register as i, not used again). + + movq (m), a + movq a, c + movq a, w + shlq $2, c + subq c, w + xorq $2, w + movq w, c + imulq a, c + movl $2, ashort + addq c, a + addq $1, c + imulq a, w + imulq c, c + movl $1, ashort + addq c, a + imulq a, w + imulq c, c + movl $1, ashort + addq c, a + imulq a, w + imulq c, c + movl $1, ashort + addq c, a + imulq a, w + +// Now do one almost-Montgomery reduction w.r.t. 
the original m +// which lops off one 2^64 from the congruence and, with the usual +// almost-Montgomery correction, gets us back inside k digits + + movq (t), c + movq w, b + imulq c, b + + movq (m), a + mulq b + addq c, a + movq d, c + movl $1, jshort + movq k, n + decq n + jz bignum_modifier_amontend +bignum_modifier_amontloop: + adcq (t,j,8), c + sbbq l, l + movq (m,j,8), a + mulq b + subq l, d + addq c, a + movq a, -8(t,j,8) + movq d, c + incq j + decq n + jnz bignum_modifier_amontloop +bignum_modifier_amontend: + adcq c, h + sbbq l, l + movq h, -8(t,k,8) + + xorq j, j + xorq c, c +bignum_modifier_aosloop: + movq (t,j,8), a + movq (m,j,8), b + andq l, b + negq c + sbbq b, a + sbbq c, c + movq a, (z,j,8) + incq j + cmpq k, j + jc bignum_modifier_aosloop + +// So far, the code (basically the same as bignum_amontifier) has produced +// a k-digit value z == 2^{128k} (mod m), not necessarily fully reduced mod m. +// We now do a short Montgomery reduction (similar to bignum_demont) so that +// we achieve full reduction mod m while lopping 2^{64k} off the congruence. +// We recycle h as the somewhat strangely-named outer loop counter. + + movq k, h + +bignum_modifier_montouterloop: + movq (z), c + movq w, b + imulq c, b + movq (m), a + mulq b + addq c, a + movq d, c + movl $1, jshort + movq k, n + decq n + jz bignum_modifier_montend +bignum_modifier_montloop: + adcq (z,j,8), c + sbbq l, l + movq (m,j,8), a + mulq b + subq l, d + addq c, a + movq a, -8(z,j,8) + movq d, c + incq j + decq n + jnz bignum_modifier_montloop +bignum_modifier_montend: + adcq $0, c + movq c, -8(z,k,8) + + decq h + jnz bignum_modifier_montouterloop + +// Now do a comparison of z with m to set a final correction mask +// indicating that z >= m and so we need to subtract m. + + xorq j, j + movq k, n +bignum_modifier_cmploop: + movq (z,j,8), a + sbbq (m,j,8), a + incq j + decq n + jnz bignum_modifier_cmploop + sbbq d, d + notq d + +// Now do a masked subtraction of m for the final reduced result. + + xorq l, l + xorq j, j +bignum_modifier_corrloop: + movq (m,j,8), a + andq d, a + negq l + sbbq a, (z,j,8) + sbbq l, l + incq j + cmpq k, j + jc bignum_modifier_corrloop + +bignum_modifier_end: + popq %r13 + popq %r12 + popq %rbx + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modinv.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modinv.S new file mode 100644 index 00000000000..f343ddd8942 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modinv.S @@ -0,0 +1,709 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Invert modulo m, z = (1/a) mod b, assuming b is an odd number > 1, coprime a +// Inputs a[k], b[k]; output z[k]; temporary buffer t[>=3*k] +// +// extern void bignum_modinv +// (uint64_t k, uint64_t *z, uint64_t *a, uint64_t *b, uint64_t *t); +// +// k-digit (digit=64 bits) "z := a^-1 mod b" (modular inverse of a modulo b) +// using t as a temporary buffer (t at least 3*k words = 24*k bytes), and +// assuming that a and b are coprime *and* that b is an odd number > 1. 
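+//
+// Illustrative (unverified) one-word reference model of the specification,
+// using a plain binary extended gcd; this is NOT the constant-time,
+// multi-word algorithm implemented below, just a statement of what it
+// computes in the k = 1 case:
+//
+//   uint64_t ref_modinv_1(uint64_t a, uint64_t b) {  // b odd > 1, gcd(a,b) = 1
+//     uint64_t u = a, v = b, x1 = 1, x2 = 0;         // x1*a == u, x2*a == v (mod b)
+//     while (u != 1 && v != 1) {
+//       while (!(u & 1)) {                           // halve u, and x1 mod b
+//         u >>= 1;
+//         x1 = (x1 & 1) ? (x1 >> 1) + (b >> 1) + 1 : x1 >> 1;
+//       }
+//       while (!(v & 1)) {                           // halve v, and x2 mod b
+//         v >>= 1;
+//         x2 = (x2 & 1) ? (x2 >> 1) + (b >> 1) + 1 : x2 >> 1;
+//       }
+//       if (u >= v) { u -= v; x1 = (x1 >= x2) ? x1 - x2 : x1 + (b - x2); }
+//       else        { v -= u; x2 = (x2 >= x1) ? x2 - x1 : x2 + (b - x1); }
+//     }
+//     return (u == 1) ? x1 : x2;                     // the inverse of a mod b
+//   }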
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = a, RCX = b, R8 = t +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = a, R9 = b, [RSP+40] = t +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modinv) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modinv) + .text + +// We get CHUNKSIZE bits per outer iteration, 64 minus a few for proxy errors + +#define CHUNKSIZE 58 + +// These variables are so fundamental we keep them consistently in registers. +// k actually stays where it was at the beginning, while l gets set up later + +#define k %rdi +#define l %r13 + +// These are kept on the stack since there aren't enough registers + +#define mat_mm (%rsp) +#define mat_mn 8(%rsp) +#define mat_nm 16(%rsp) +#define mat_nn 24(%rsp) +#define t 32(%rsp) +// Modular inverse +#define v 40(%rsp) +// We reconstruct n as m + 8*k as needed +#define m 48(%rsp) +#define w 56(%rsp) +#define z 64(%rsp) +// Original b pointer, not b the temp +#define bm 72(%rsp) + +#define STACKVARSIZE 80 + +// These get set to m/n or w/z during the cross-multiplications etc. +// Otherwise they can be used as additional temporaries + +#define p1 %r8 +#define p2 %r15 + +// These are shorthands for common temporary registers + +#define a %rax +#define b %rbx +#define c %rcx +#define d %rdx +#define i %r9 + +// Temporaries for the top proxy selection part + +#define c1 %r10 +#define c2 %r11 +#define h1 %r12 +#define h2 %rbp +#define l1 %r14 +#define l2 %rsi + +// Re-use for the actual proxies; m_hi = h1 and n_hi = h2 are assumed + +#define m_hi %r12 +#define n_hi %rbp +#define m_lo %r14 +#define n_lo %rsi + +// Re-use for the matrix entries in the inner loop, though they +// get spilled to the corresponding memory locations mat_... + +#define m_m %r10 +#define m_n %r11 +#define n_m %rcx +#define n_n %rdx + +#define ashort %eax +#define ishort %r9d +#define m_mshort %r10d +#define m_nshort %r11d +#define n_mshort %ecx +#define n_nshort %edx + +S2N_BN_SYMBOL(bignum_modinv): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// Save all required registers and make room on stack for all the above vars + + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $STACKVARSIZE, %rsp + +// If k = 0 then do nothing (this is out of scope anyway) + + testq k, k + jz bignum_modinv_end + +// Set up the additional two buffers m and n beyond w in temp space +// and record all pointers m, n, w and z in stack-based variables + + movq %rsi, z + movq %r8, w + movq %rcx, bm + leaq (%r8,k,8), %r10 + movq %r10, m + leaq (%r10,k,8), p2 + +// Initialize the main buffers with their starting values: +// m = a, n = b, w = b (to be tweaked to b - 1) and z = 0 + + xorq %r11, %r11 + xorq i, i +bignum_modinv_copyloop: + movq (%rdx,i,8), a + movq (%rcx,i,8), b + movq a, (%r10,i,8) + movq b, (p2,i,8) + movq b, (%r8,i,8) + movq %r11, (%rsi,i,8) + incq i + cmpq k, i + jc bignum_modinv_copyloop + +// Tweak down w to b - 1 (this crude approach is safe as b needs to be odd +// for it to be in scope). 
We have then established the congruence invariant: +// +// a * w == -m (mod b) +// a * z == n (mod b) +// +// This, with the bounds w <= b and z <= b, is maintained round the outer loop + + movq (%r8), a + movq a, b + decq b + movq b, (%r8) + +// Compute v = negated modular inverse of b mod 2^64, reusing a from above +// This is used for Montgomery reduction operations each time round the loop + + movq a, h2 + movq a, h1 + shlq $2, h2 + subq h2, h1 + xorq $2, h1 + + movq h1, h2 + imulq a, h2 + movl $2, ashort + addq h2, a + addq $1, h2 + + imulq a, h1 + + imulq h2, h2 + movl $1, ashort + addq h2, a + imulq a, h1 + + imulq h2, h2 + movl $1, ashort + addq h2, a + imulq a, h1 + + imulq h2, h2 + movl $1, ashort + addq h2, a + imulq a, h1 + + movq h1, v + +// Set up the outer loop count of 128 * k +// The invariant is that m * n < 2^t at all times. + + movq k, a + shlq $7, a + movq a, t + +// Start of the main outer loop iterated t / CHUNKSIZE times + +bignum_modinv_outerloop: + +// We need only bother with sharper l = min k (ceil(t/64)) digits +// for the computations on m and n (but we still need k for w and z). +// Either both m and n fit in l digits, or m has become zero and so +// nothing happens in the loop anyway and this makes no difference. + + movq t, l + addq $63, l + shrq $6, l + cmpq k, l + cmovncq k, l + +// Select upper and lower proxies for both m and n to drive the inner +// loop. The lower proxies are simply the lowest digits themselves, +// m_lo = m[0] and n_lo = n[0], while the upper proxies are bitfields +// of the two inputs selected so their top bit (63) aligns with the +// most significant bit of *either* of the two inputs. + + xorq h1, h1 // Previous high and low for m + xorq l1, l1 + xorq h2, h2 // Previous high and low for n + xorq l2, l2 + xorq c2, c2 // Mask flag: previous word of one was nonzero + // and in this case h1 and h2 are those words + + movq m, p1 + leaq (p1,k,8), p2 + xorq i, i +bignum_modinv_toploop: + movq (p1,i,8), b + movq (p2,i,8), c + movq c2, c1 + andq h1, c1 + andq h2, c2 + movq b, a + orq c, a + negq a + cmovcq c1, l1 + cmovcq c2, l2 + cmovcq b, h1 + cmovcq c, h2 + sbbq c2, c2 + incq i + cmpq l, i + jc bignum_modinv_toploop + + movq h1, a + orq h2, a + bsrq a, c + xorq $63, c + shldq %cl, l1, h1 + shldq %cl, l2, h2 + +// m_lo = m[0], n_lo = n[0]; + + movq (p1), %rax + movq %rax, m_lo + + movq (p2), %rax + movq %rax, n_lo + +// Now the inner loop, with i as loop counter from CHUNKSIZE down. +// This records a matrix of updates to apply to the initial +// values of m and n with, at stage j: +// +// sgn * m' = (m_m * m - m_n * n) / 2^j +// -sgn * n' = (n_m * m - n_n * n) / 2^j +// +// where "sgn" is either +1 or -1, and we lose track of which except +// that both instance above are the same. This throwing away the sign +// costs nothing (since we have to correct in general anyway because +// of the proxied comparison) and makes things a bit simpler. But it +// is simply the parity of the number of times the first condition, +// used as the swapping criterion, fires in this loop. 
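+//
+// Illustrative (unverified) C rendering of one step of this inner loop, with
+// the conditional swap written as branches for clarity (the code below is
+// branch-free; swap() is just shorthand for exchanging two variables):
+//
+//   for (int j = 0; j < CHUNKSIZE; j++) {
+//     if ((m_lo & 1) && m_hi < n_hi) {             // swap the roles of m and n
+//       swap(&m_hi, &n_hi); swap(&m_lo, &n_lo);
+//       swap(&m_m, &n_m);   swap(&m_n, &n_n);
+//     }
+//     if (m_lo & 1) {                              // make m even by subtracting n
+//       m_hi -= n_hi; m_lo -= n_lo;
+//       m_m += n_m;   m_n += n_n;
+//     }
+//     m_hi >>= 1; m_lo >>= 1;                      // halve m's proxies ...
+//     n_m <<= 1;  n_n <<= 1;                       // ... and double n's matrix row
+//   }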
+ + movl $1, m_mshort + movl $0, m_nshort + movl $0, n_mshort + movl $1, n_nshort + movl $CHUNKSIZE, ishort + +// Stash more variables over the inner loop to free up regs + + movq k, mat_mn + movq l, mat_nm + movq p1, mat_mm + movq p2, mat_nn + +// Conceptually in the inner loop we follow these steps: +// +// * If m_lo is odd and m_hi < n_hi, then swap the four pairs +// (m_hi,n_hi); (m_lo,n_lo); (m_m,n_m); (m_n,n_n) +// +// * Now, if m_lo is odd (old or new, doesn't matter as initial n_lo is odd) +// m_hi := m_hi - n_hi, m_lo := m_lo - n_lo +// m_m := m_m + n_m, m_n := m_n + n_n +// +// * Halve and double them +// m_hi := m_hi / 2, m_lo := m_lo / 2 +// n_m := n_m * 2, n_n := n_n * 2 +// +// The actual computation computes updates before actually swapping and +// then corrects as needed. + +bignum_modinv_innerloop: + + xorl %eax, %eax + xorl %ebx, %ebx + xorq p1, p1 + xorq p2, p2 + btq $0, m_lo + + cmovcq n_hi, %rax + cmovcq n_lo, %rbx + cmovcq n_m, p1 + cmovcq n_n, p2 + + movq m_lo, l + subq %rbx, m_lo + subq l, %rbx + movq m_hi, k + subq %rax, k + cmovcq m_hi, n_hi + leaq -1(k), m_hi + cmovcq %rbx, m_lo + cmovcq l, n_lo + notq m_hi + cmovcq m_m, n_m + cmovcq m_n, n_n + cmovncq k, m_hi + + shrq $1, m_lo + addq p1, m_m + addq p2, m_n + shrq $1, m_hi + addq n_m, n_m + addq n_n, n_n + +// End of the inner for-loop + + decq i + jnz bignum_modinv_innerloop + +// Unstash the temporary variables + + movq mat_mn, k + movq mat_nm, l + movq mat_mm, p1 + movq mat_nn, p2 + +// Put the matrix entries in memory since we're out of registers +// We pull them out repeatedly in the next loop + + movq m_m, mat_mm + movq m_n, mat_mn + movq n_m, mat_nm + movq n_n, mat_nn + +// Apply the update to w and z, using addition in this case, and also take +// the chance to shift an additional 6 = 64-CHUNKSIZE bits to be ready for a +// Montgomery multiplication. Because we know that m_m + m_n <= 2^58 and +// w, z <= b < 2^{64k}, we know that both of these fit in k+1 words. +// We do this before the m-n update to allow us to play with c1 and c2 here. 
+// +// l1::w = 2^6 * (m_m * w + m_n * z) +// l2::z = 2^6 * (n_m * w + n_n * z) +// +// with c1 and c2 recording previous words for the shifting part + + movq w, p1 + movq z, p2 + xorq l1, l1 + xorq l2, l2 + xorq c1, c1 + xorq c2, c2 + xorq i, i +bignum_modinv_congloop: + + movq (p1,i,8), c + movq mat_mm, a + mulq c + addq a, l1 + adcq $0, d + movq d, h1 // Now h1::l1 := m_m * w + l1_in + + movq mat_nm, a + mulq c + addq a, l2 + adcq $0, d + movq d, h2 // Now h2::l2 := n_m * w + l2_in + + movq (p2,i,8), c + movq mat_mn, a + mulq c + addq a, l1 + adcq d, h1 // h1::l1 := m_m * w + m_n * z + l1_in + shrdq $CHUNKSIZE, l1, c1 + movq c1, (p1,i,8) + movq l1, c1 + movq h1, l1 + + movq mat_nn, a + mulq c + addq a, l2 + adcq d, h2 // h2::l2 := n_m * w + n_n * z + l2_in + shrdq $CHUNKSIZE, l2, c2 + movq c2, (p2,i,8) + movq l2, c2 + movq h2, l2 + + incq i + cmpq k, i + jc bignum_modinv_congloop + + shldq $64-CHUNKSIZE, c1, l1 + shldq $64-CHUNKSIZE, c2, l2 + +// Do a Montgomery reduction of l1::w + + movq bm, p2 + + movq (p1), b + movq v, h1 + imulq b, h1 + movq (p2), a + mulq h1 + addq b, a // Will be zero but want the carry + movq %rdx, c1 + movl $1, ishort + movq k, c + decq c + jz bignum_modinv_wmontend + +bignum_modinv_wmontloop: + adcq (p1,i,8), c1 + sbbq b, b + movq (p2,i,8), a + mulq h1 + subq b, %rdx + addq c1, a + movq a, -8(p1,i,8) + movq %rdx, c1 + incq i + decq c + jnz bignum_modinv_wmontloop + +bignum_modinv_wmontend: + adcq l1, c1 + movq c1, -8(p1,k,8) + sbbq c1, c1 + negq c1 + + movq k, c + xorq i, i +bignum_modinv_wcmploop: + movq (p1,i,8), a + sbbq (p2,i,8), a + incq i + decq c + jnz bignum_modinv_wcmploop + sbbq $0, c1 + sbbq c1, c1 + notq c1 + + xorq c, c + xorq i, i +bignum_modinv_wcorrloop: + movq (p1,i,8), a + movq (p2,i,8), b + andq c1, b + negq c + sbbq b, a + sbbq c, c + movq a, (p1,i,8) + incq i + cmpq k, i + jc bignum_modinv_wcorrloop + +// Do a Montgomery reduction of l2::z + + movq z, p1 + + movq (p1), b + movq v, h2 + imulq b, h2 + movq (p2), a + mulq h2 + addq b, a // Will be zero but want the carry + movq %rdx, c2 + movl $1, ishort + movq k, c + decq c + jz bignum_modinv_zmontend + +bignum_modinv_zmontloop: + adcq (p1,i,8), c2 + sbbq b, b + movq (p2,i,8), a + mulq h2 + subq b, %rdx + addq c2, a + movq a, -8(p1,i,8) + movq %rdx, c2 + incq i + decq c + jnz bignum_modinv_zmontloop + +bignum_modinv_zmontend: + adcq l2, c2 + movq c2, -8(p1,k,8) + sbbq c2, c2 + negq c2 + + movq k, c + xorq i, i +bignum_modinv_zcmploop: + movq (p1,i,8), a + sbbq (p2,i,8), a + incq i + decq c + jnz bignum_modinv_zcmploop + sbbq $0, c2 + sbbq c2, c2 + notq c2 + + xorq c, c + xorq i, i +bignum_modinv_zcorrloop: + movq (p1,i,8), a + movq (p2,i,8), b + andq c2, b + negq c + sbbq b, a + sbbq c, c + movq a, (p1,i,8) + incq i + cmpq k, i + jc bignum_modinv_zcorrloop + +// Now actually compute the updates to m and n corresponding to the matrix, +// and correct the signs if they have gone negative. 
First we compute the +// (k+1)-sized updates with the following invariant (here h1 and h2 are in +// fact carry bitmasks, either 0 or -1): +// +// h1::l1::m = m_m * m - m_n * n +// h2::l2::n = n_m * m - n_n * n + + movq m, p1 + leaq (p1,k,8), p2 + xorq i, i + xorq h1, h1 + xorq l1, l1 + xorq h2, h2 + xorq l2, l2 +bignum_modinv_crossloop: + + movq (p1,i,8), c + movq mat_mm, a + mulq c + addq a, l1 + adcq $0, d + movq d, c1 // Now c1::l1 is +ve part 1 + + movq mat_nm, a + mulq c + addq a, l2 + adcq $0, d + movq d, c2 // Now c2::l2 is +ve part 2 + + movq (p2,i,8), c + movq mat_mn, a + mulq c + subq h1, d // Now d::a is -ve part 1 + + subq a, l1 + sbbq d, c1 + sbbq h1, h1 + movq l1, (p1,i,8) + movq c1, l1 + + movq mat_nn, a + mulq c + subq h2, d // Now d::a is -ve part 2 + + subq a, l2 + sbbq d, c2 + sbbq h2, h2 + movq l2, (p2,i,8) + movq c2, l2 + + incq i + cmpq l, i + jc bignum_modinv_crossloop + +// Now fix the signs of m and n if they have gone negative + + xorq i, i + movq h1, c1 // carry-in coded up as well + movq h2, c2 // carry-in coded up as well + xorq h1, l1 // for the bignum_modinv_end digit + xorq h2, l2 // for the bignum_modinv_end digit +bignum_modinv_optnegloop: + movq (p1,i,8), a + xorq h1, a + negq c1 + adcq $0, a + sbbq c1, c1 + movq a, (p1,i,8) + movq (p2,i,8), a + xorq h2, a + negq c2 + adcq $0, a + sbbq c2, c2 + movq a, (p2,i,8) + incq i + cmpq l, i + jc bignum_modinv_optnegloop + subq c1, l1 + subq c2, l2 + +// Now shift them right CHUNKSIZE bits + + movq l, i +bignum_modinv_shiftloop: + movq -8(p1,i,8), a + movq a, c1 + shrdq $CHUNKSIZE, l1, a + movq a, -8(p1,i,8) + movq c1, l1 + movq -8(p2,i,8), a + movq a, c2 + shrdq $CHUNKSIZE, l2, a + movq a, -8(p2,i,8) + movq c2, l2 + decq i + jnz bignum_modinv_shiftloop + +// Finally, use the signs h1 and h2 to do optional modular negations of +// w and z respectively, flipping h2 to make signs work. We don't make +// any checks for zero values, but we certainly retain w <= b and z <= b. +// This is enough for the Montgomery step in the next iteration to give +// strict reduction w < b amd z < b, and anyway when we terminate we +// could not have z = b since it violates the coprimality assumption for +// in-scope cases. + + notq h2 + movq bm, c + movq w, p1 + movq z, p2 + movq h1, c1 + movq h2, c2 + xorq i, i +bignum_modinv_fliploop: + movq h2, d + movq (c,i,8), a + andq a, d + andq h1, a + movq (p1,i,8), b + xorq h1, b + negq c1 + adcq b, a + sbbq c1, c1 + movq a, (p1,i,8) + movq (p2,i,8), b + xorq h2, b + negq c2 + adcq b, d + sbbq c2, c2 + movq d, (p2,i,8) + incq i + cmpq k, i + jc bignum_modinv_fliploop + +// End of main loop. We can stop if t' <= 0 since then m * n < 2^0, which +// since n is odd and m and n are coprime (in the in-scope cases) means +// m = 0, n = 1 and hence from the congruence invariant a * z == 1 (mod b). +// Moreover we do in fact need to maintain strictly t > 0 in the main loop, +// or the computation of the optimized digit bound l could collapse to 0. 
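A small sketch of the surrounding bookkeeping (hypothetical helper, not the assembly's register usage): t counts remaining bits and drops by CHUNKSIZE per round, and the digit bound l = min(k, ceil(t/64)) shrinks with it, which is why t has to stay strictly positive while the loop is still running.

#include <stdint.h>

static void modinv_outer_shape(uint64_t k)      /* assumes k >= 1 */
{
  const int64_t CHUNK = 58;                     /* CHUNKSIZE */
  for (int64_t t = 128 * (int64_t)k; t > 0; t -= CHUNK) {
    uint64_t l = ((uint64_t)t + 63) / 64;       /* ceil(t/64) */
    if (l > k) l = k;                           /* never more than k digits */
    (void)l;  /* ... one outer round over l digits of m/n, k digits of w/z ... */
  }
}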
+ + subq $CHUNKSIZE, t + jnbe bignum_modinv_outerloop + +bignum_modinv_end: + addq $STACKVARSIZE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modoptneg.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modoptneg.S new file mode 100644 index 00000000000..b575d00127b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modoptneg.S @@ -0,0 +1,97 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally negate modulo m, z := (-x) mod m (if p nonzero) or z := x +// (if p zero), assuming x reduced +// Inputs p, x[k], m[k]; output z[k] +// +// extern void bignum_modoptneg +// (uint64_t k, uint64_t *z, uint64_t p, uint64_t *x, uint64_t *m); +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = p, RCX = x, R8 = m +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = p, R9 = x, [RSP+40] = m +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modoptneg) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modoptneg) + .text + +#define k %rdi +#define z %rsi +#define p %rdx +#define x %rcx +#define m %r8 + +#define a %r9 +#define c %rax +#define b %r10 +#define i %r11 + +S2N_BN_SYMBOL(bignum_modoptneg): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// Do nothing if k = 0 + + testq k, k + jz bignum_modoptneg_end + +// Make an additional check for zero input, and force p to zero in this case. +// This can be skipped if the input is known not to be zero a priori. + + xorq i, i + xorq a, a +bignum_modoptneg_cmploop: + orq (x,i,8), a + incq i + cmpq k, i + jc bignum_modoptneg_cmploop + + cmpq $0, a + cmovzq a, p + +// Turn the input p into a strict bitmask + + negq p + sbbq p, p + +// Main loop + + xorq i, i + movq p, c +bignum_modoptneg_mainloop: + movq (m,i,8), a + andq p, a + movq (x,i,8), b + xorq p, b + negq c + adcq b, a + sbbq c, c + movq a, (z,i,8) + incq i + cmpq k, i + jc bignum_modoptneg_mainloop + +bignum_modoptneg_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modsub.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modsub.S new file mode 100644 index 00000000000..738a1bbb190 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modsub.S @@ -0,0 +1,86 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Subtract modulo m, z := (x - y) mod m, assuming x and y reduced +// Inputs x[k], y[k], m[k]; output z[k] +// +// extern void bignum_modsub +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = y, R8 = m +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = y, [RSP+40] = m +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modsub) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modsub) + .text + +#define k %rdi +#define z %rsi +#define x %rdx +#define y %rcx +#define m %r8 +#define i %r9 +#define j %r10 +#define a %rax +#define c %r11 + +S2N_BN_SYMBOL(bignum_modsub): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// If k = 0 do nothing + + testq k, k + jz bignum_modsub_end + +// Subtract z := x - y and record a mask for the carry x - y < 0 + + xorq c, c + movq k, j + xorq i, i +bignum_modsub_subloop: + movq (x,i,8), a + sbbq (y,i,8), a + movq a, (z,i,8) + incq i + decq j + jnz bignum_modsub_subloop + sbbq c, c + +// Now do a masked addition z := z + [c] * m + + xorq i, i +bignum_modsub_addloop: + movq (m,i,8), a + andq c, a + negq j + adcq a, (z,i,8) + sbbq j, j + incq i + cmpq k, i + jc bignum_modsub_addloop + +bignum_modsub_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_montifier.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_montifier.S new file mode 100644 index 00000000000..c14035c15f0 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_montifier.S @@ -0,0 +1,540 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Compute "montification" constant z := 2^{128k} mod m +// Input m[k]; output z[k]; temporary buffer t[>=k] +// +// extern void bignum_montifier +// (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t); +// +// The last argument points to a temporary buffer t that should have size >= k. +// This is called "montifier" because given any other k-digit number x, +// whether or not it's reduced modulo m, it can be mapped to its Montgomery +// representation (2^{64k} * x) mod m just by Montgomery multiplication by z. 
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = m, RCX = t +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = m, R9 = t +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montifier) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montifier) + .text + +#define k %rdi +#define z %rsi + +// These two inputs get moved to different places since RCX and RDX are special + +#define m %r12 +#define t %r13 + +// Other variables + +#define i %rbx +// Modular inverse; aliased to i, but we never use them together +#define w %rbx +#define j %rbp +// Matters that this is RAX for special use in multiplies +#define a %rax +// Matters that this is RDX for special use in multiplies +#define d %rdx +// Matters that this is RCX as CL=lo(c) is assumed in shifts +#define c %rcx +#define h %r11 +#define l %r10 +#define b %r9 +#define n %r8 + +// Some aliases for the values b and n + +#define q %r8 +#define r %r9 + +#define ashort %eax +#define ishort %ebx +#define jshort %ebp +#define qshort %r8d + + +S2N_BN_SYMBOL(bignum_montifier): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Save some additional registers for use, copy args out of RCX and RDX + + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + + movq %rdx, m + movq %rcx, t + +// If k = 0 the whole operation is trivial + + testq k, k + jz bignum_montifier_end + +// Copy the input m into the temporary buffer t. The temporary register +// c matters since we want it to hold the highest digit, ready for the +// normalization phase. + + xorq i, i +bignum_montifier_copyinloop: + movq (m,i,8), c + movq c, (t,i,8) + incq i + cmpq k, i + jc bignum_montifier_copyinloop + +// Do a rather stupid but constant-time digit normalization, conditionally +// shifting left (k-1) times based on whether the top word is zero. +// With careful binary striding this could be O(k*log(k)) instead of O(k^2) +// while still retaining the constant-time style. +// The "neg c" sets the zeroness predicate (~CF) for the entire inner loop + + movq k, i + decq i + jz bignum_montifier_normalized +bignum_montifier_normloop: + xorq j, j + movq k, h + negq c + movl $0, ashort +bignum_montifier_shufloop: + movq a, c + movq (t,j,8), a + cmovcq a, c + movq c, (t,j,8) + incq j + decq h + jnz bignum_montifier_shufloop + decq i + jnz bignum_montifier_normloop + +// We now have the top digit nonzero, assuming the input was nonzero, +// and as per the invariant of the loop above, c holds that digit. So +// now just count c's leading zeros and shift t bitwise that many bits. +// Note that we don't care about the result of bsr for zero inputs so +// the simple xor-ing with 63 is safe. + +bignum_montifier_normalized: + + bsrq c, c + xorq $63, c + + xorq b, b + xorq i, i +bignum_montifier_bitloop: + movq (t,i,8), a + movq a, j + shldq %cl, b, a + movq a, (t,i,8) + movq j, b + incq i + cmpq k, i + jc bignum_montifier_bitloop + +// Let h be the high word of n, which in all the in-scope cases is >= 2^63. +// Now successively form q = 2^i div h and r = 2^i mod h as i goes from +// 64 to 126. We avoid just using division out of constant-time concerns +// (at the least we would need to fix up h = 0 for out-of-scope inputs) and +// don't bother with Newton-Raphson, since this stupid simple loop doesn't +// contribute much of the overall runtime at typical sizes. 
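The same computation as a C sketch (hypothetical helper; the ternary stands in for the carry/mask sequence used below to stay constant-time): starting from 2^64 = 1*h + (2^64 - h) for normalized h >= 2^63, each step doubles q and r and moves one extra h into q whenever 2*r >= h, giving q = 2^126 div h and r = 2^126 mod h after 62 steps, up to the r = h = 2^63 corner case fixed afterwards.

#include <stdint.h>

static void estimate_q_r(uint64_t h, uint64_t *q_out, uint64_t *r_out)
{
  uint64_t q = 1;
  uint64_t r = 0 - h;                   /* 2^64 mod h, valid since h >= 2^63 */
  for (int i = 64; i < 126; i++) {      /* 62 doubling steps */
    uint64_t take = (r >= h - r);       /* 2*r >= h, computed without overflow */
    q = 2 * q + take;
    r = 2 * r;                          /* may wrap mod 2^64 ... */
    if (take) r -= h;                   /* ... but 2*r - h < h < 2^64 is exact */
  }
  *q_out = q;
  *r_out = r;
}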
+ + movq -8(t,k,8), h + movl $1, qshort + movq h, r + negq r + movl $62, ishort +bignum_montifier_estloop: + + addq q, q + movq h, a + subq r, a + cmpq a, r // CF <=> r < h - r <=> 2 * r < h + sbbq a, a + notq a // a = bitmask(2 * r >= h) + subq a, q + addq r, r + andq h, a + subq a, r + decq i + jnz bignum_montifier_estloop + +// Strictly speaking the above loop doesn't quite give the true remainder +// and quotient in the special case r = h = 2^63, so fix it up. We get +// q = 2^63 - 1 and r = 2^63 and really want q = 2^63 and r = 0. This is +// supererogatory, because the main property of q used below still holds +// in this case unless the initial m = 1, and then anyway the overall +// specification (congruence modulo m) holds degenerately. But it seems +// nicer to get a "true" quotient and remainder. + + incq r + cmpq r, h + adcq $0, q + +// So now we have q and r with 2^126 = q * h + r (imagining r = 0 in the +// fixed-up case above: note that we never actually use the computed +// value of r below and so didn't adjust it). And we can assume the ranges +// q <= 2^63 and r < h < 2^64. +// +// The idea is to use q as a first quotient estimate for a remainder +// of 2^{p+62} mod n, where p = 64 * k. We have, splitting n into the +// high and low parts h and l: +// +// 2^{p+62} - q * n = 2^{p+62} - q * (2^{p-64} * h + l) +// = 2^{p+62} - (2^{p-64} * (q * h) + q * l) +// = 2^{p+62} - 2^{p-64} * (2^126 - r) - q * l +// = 2^{p-64} * r - q * l +// +// Note that 2^{p-64} * r < 2^{p-64} * h <= n +// and also q * l < 2^63 * 2^{p-64} = 2^{p-1} <= n +// so |diff| = |2^{p-64} * r - q * l| < n. +// +// If in fact diff >= 0 then it is already 2^{p+62} mod n. +// otherwise diff + n is the right answer. +// +// To (maybe?) make the computation slightly easier we actually flip +// the sign and compute d = q * n - 2^{p+62}. Then the answer is either +// -d (when negative) or n - d; in either case we effectively negate d. +// This negating tweak in fact spoils the result for cases where +// 2^{p+62} mod n = 0, when we get n instead. However the only case +// where this can happen is m = 1, when the whole spec holds trivially, +// and actually the remainder of the logic below works anyway since +// the latter part of the code only needs a congruence for the k-digit +// result, not strict modular reduction (the doublings will maintain +// the non-strict inequality). + + xorq c, c + xorq i, i +bignum_montifier_mulloop: + movq (t,i,8), a + mulq q + addq c, a + adcq $0, d + movq a, (z,i,8) + movq d, c + incq i + cmpq k, i + jc bignum_montifier_mulloop + +// Now c is the high word of the product, so subtract 2^62 +// and then turn it into a bitmask in q = h + + movq $0x4000000000000000, a + subq a, c + sbbq q, q + notq q + +// Now do [c] * n - d for our final answer + + xorq c, c + xorq i, i +bignum_montifier_remloop: + movq (t,i,8), a + andq q, a + negq c + sbbq (z,i,8), a + sbbq c, c + movq a, (z,i,8) + incq i + cmpq k, i + jc bignum_montifier_remloop + +// Now still need to do a couple of modular doublings to get us all the +// way up to 2^{p+64} == r from initial 2^{p+62} == r (mod n). 
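A rough C model of one such doubling pass (hypothetical helper; assumes a compiler with unsigned __int128, and uses a branch where the assembly uses a mask): compute z := 2*z - n with a word-level borrow chain, then add n back if the result went negative, keeping z in k words with z <= n.

#include <stdint.h>

static void modular_double(uint64_t k, uint64_t *z, const uint64_t *n)
{
  uint64_t hi = 0, borrow = 0;
  for (uint64_t i = 0; i < k; i++) {          /* z := 2*z - n */
    uint64_t d = (z[i] << 1) | hi;            /* 2*z, word by word */
    hi = z[i] >> 63;
    unsigned __int128 s = (unsigned __int128)d - n[i] - borrow;
    z[i] = (uint64_t)s;
    borrow = (uint64_t)(s >> 64) & 1;
  }
  if (hi < borrow) {                          /* 2*z - n went negative */
    uint64_t carry = 0;
    for (uint64_t i = 0; i < k; i++) {        /* z := z + n */
      unsigned __int128 s = (unsigned __int128)z[i] + n[i] + carry;
      z[i] = (uint64_t)s;
      carry = (uint64_t)(s >> 64);
    }
  }
}

Running this twice moves the residue from 2^{p+62} to 2^{p+64} modulo n, as described above.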
+ + xorq c, c + xorq j, j + xorq b, b +bignum_montifier_dubloop1: + movq (z,j,8), a + shrdq $63, a, c + negq b + sbbq (t,j,8), c + sbbq b, b + movq c, (z,j,8) + movq a, c + incq j + cmpq k, j + jc bignum_montifier_dubloop1 + shrq $63, c + addq b, c + xorq j, j + xorq b, b +bignum_montifier_corrloop1: + movq (t,j,8), a + andq c, a + negq b + adcq (z,j,8), a + sbbq b, b + movq a, (z,j,8) + incq j + cmpq k, j + jc bignum_montifier_corrloop1 + +// This is not exactly the same: we also copy output to t giving the +// initialization t_1 = r == 2^{p+64} mod n for the main loop next. + + xorq c, c + xorq j, j + xorq b, b +bignum_montifier_dubloop2: + movq (z,j,8), a + shrdq $63, a, c + negq b + sbbq (t,j,8), c + sbbq b, b + movq c, (z,j,8) + movq a, c + incq j + cmpq k, j + jc bignum_montifier_dubloop2 + shrq $63, c + addq b, c + xorq j, j + xorq b, b +bignum_montifier_corrloop2: + movq (t,j,8), a + andq c, a + negq b + adcq (z,j,8), a + sbbq b, b + movq a, (z,j,8) + movq a, (t,j,8) + incq j + cmpq k, j + jc bignum_montifier_corrloop2 + +// We then successively generate (k+1)-digit values satisfying +// t_i == 2^{p+64*i} mod n, each of which is stored in h::t. Finish +// initialization by zeroing h initially + + xorq h, h + +// Then if t_i = 2^{p} * h + l +// we have t_{i+1} == 2^64 * t_i +// = (2^{p+64} * h) + (2^64 * l) +// == r * h + l<<64 +// Do this 2*k more times so we end up == 2^{192*k+64}, one more than we want +// +// Writing B = 2^{64k}, the possible correction of adding r, which for +// a (k+1)-digit result is equivalent to subtracting q = 2^{64*(k+1)} - r +// would give the overall worst-case value minus q of +// [ B * (B^k - 1) + (B - 1) * r ] - [B^{k+1} - r] +// = B * (r - 1) < B^{k+1} so we keep inside k+1 digits as required. +// +// This implementation makes the shift implicit by starting b with the +// "previous" digit (initially 0) to offset things by 1. + + leaq (k,k), i +bignum_montifier_modloop: + xorq b, b + movq k, n + xorq j, j + xorq c, c +bignum_montifier_cmaloop: + adcq b, c + sbbq l, l + movq (z,j,8), a + mulq h + subq l, d + addq c, a + movq (t,j,8), b + movq a, (t,j,8) + movq d, c + incq j + decq n + jnz bignum_montifier_cmaloop + adcq c, b + movq b, h + + sbbq l, l + + xorq j, j + xorq c, c +bignum_montifier_oaloop: + movq (t,j,8), a + movq (z,j,8), b + andq l, b + negq c + adcq b, a + sbbq c, c + movq a, (t,j,8) + incq j + cmpq k, j + jc bignum_montifier_oaloop + subq c, h + + decq i + jnz bignum_montifier_modloop + +// Compute the negated modular inverse w (same register as i, not used again). + + movq (m), a + movq a, c + movq a, w + shlq $2, c + subq c, w + xorq $2, w + movq w, c + imulq a, c + movl $2, ashort + addq c, a + addq $1, c + imulq a, w + imulq c, c + movl $1, ashort + addq c, a + imulq a, w + imulq c, c + movl $1, ashort + addq c, a + imulq a, w + imulq c, c + movl $1, ashort + addq c, a + imulq a, w + +// Now do one almost-Montgomery reduction w.r.t. 
the original m +// which lops off one 2^64 from the congruence and, with the usual +// almost-Montgomery correction, gets us back inside k digits + + movq (t), c + movq w, b + imulq c, b + + movq (m), a + mulq b + addq c, a + movq d, c + movl $1, jshort + movq k, n + decq n + jz bignum_montifier_amontend +bignum_montifier_amontloop: + adcq (t,j,8), c + sbbq l, l + movq (m,j,8), a + mulq b + subq l, d + addq c, a + movq a, -8(t,j,8) + movq d, c + incq j + decq n + jnz bignum_montifier_amontloop +bignum_montifier_amontend: + adcq c, h + sbbq l, l + movq h, -8(t,k,8) + + xorq j, j + xorq c, c +bignum_montifier_aosloop: + movq (t,j,8), a + movq (m,j,8), b + andq l, b + negq c + sbbq b, a + sbbq c, c + movq a, (z,j,8) + incq j + cmpq k, j + jc bignum_montifier_aosloop + +// So far, the code (basically a variant of bignum_amontifier) has produced +// a k-digit value z == 2^{192k} (mod m), not necessarily fully reduced mod m. +// We now do a short Montgomery reduction (similar to bignum_demont) so that +// we achieve full reduction mod m while lopping 2^{64k} off the congruence. +// We recycle h as the somewhat strangely-named outer loop counter. + + movq k, h + +bignum_montifier_montouterloop: + movq (z), c + movq w, b + imulq c, b + movq (m), a + mulq b + addq c, a + movq d, c + movl $1, jshort + movq k, n + decq n + jz bignum_montifier_montend +bignum_montifier_montloop: + adcq (z,j,8), c + sbbq l, l + movq (m,j,8), a + mulq b + subq l, d + addq c, a + movq a, -8(z,j,8) + movq d, c + incq j + decq n + jnz bignum_montifier_montloop +bignum_montifier_montend: + adcq $0, c + movq c, -8(z,k,8) + + decq h + jnz bignum_montifier_montouterloop + +// Now do a comparison of z with m to set a final correction mask +// indicating that z >= m and so we need to subtract m. + + xorq j, j + movq k, n +bignum_montifier_cmploop: + movq (z,j,8), a + sbbq (m,j,8), a + incq j + decq n + jnz bignum_montifier_cmploop + sbbq d, d + notq d + +// Now do a masked subtraction of m for the final reduced result. + + xorq l, l + xorq j, j +bignum_montifier_corrloop: + movq (m,j,8), a + andq d, a + negq l + sbbq a, (z,j,8) + sbbq l, l + incq j + cmpq k, j + jc bignum_montifier_corrloop + +bignum_montifier_end: + popq %r13 + popq %r12 + popq %rbx + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_montmul.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_montmul.S new file mode 100644 index 00000000000..0a914f8f4fe --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_montmul.S @@ -0,0 +1,260 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^{64k}) mod m +// Inputs x[k], y[k], m[k]; output z[k] +// +// extern void bignum_montmul +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); +// +// Does z := (x * y / 2^{64k}) mod m, assuming x * y <= 2^{64k} * m, which is +// guaranteed in particular if x < m, y < m initially (the "intended" case). 
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = y, R8 = m +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = y, [RSP+40] = m +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul) + .text + +// We copy x to %r9 but it comes in in %rdx originally + +#define k %rdi +#define z %rsi +#define x %r9 +#define y %rcx +#define m %r8 + +// General temp, low part of product and mul input +#define a %rax +// General temp, High part of product +#define b %rdx +// Inner loop counter +#define j %rbx +// Home for i'th digit or Montgomery multiplier +#define d %rbp +#define h %r10 +#define e %r11 +#define n %r12 +#define i %r13 +#define c0 %r14 +#define c1 %r15 + +// This one variable we store on the stack as we are a register short. +// At least it's only used once per iteration of the outer loop (k times) +// and with a single read each time, after one initial write. It's the +// word-level negated modular inverse. + +#define w (%rsp) + +// Some more intuitive names for temp regs in initial word-level negmodinv. + +#define t1 %rbx +#define t2 %rdx + +#define ashort %eax +#define jshort %ebx + + +S2N_BN_SYMBOL(bignum_montmul): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// Save registers and allocate space on stack for non-register variable w + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $8, %rsp + +// If k = 0 the whole operation is trivial + + testq k, k + jz bignum_montmul_end + +// Move x input into its permanent home, since we need %rdx for multiplications + + movq %rdx, x + +// Compute word-level negated modular inverse w for m == m[0]. + + movq (m), a + + movq a, t2 + movq a, t1 + shlq $2, t2 + subq t2, t1 + xorq $2, t1 + + movq t1, t2 + imulq a, t2 + movl $2, ashort + addq t2, a + addq $1, t2 + + imulq a, t1 + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, t1 + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, t1 + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, t1 + + movq t1, w + +// Initialize the output c0::z to zero so we can then consistently add rows. +// It would be a bit more efficient to special-case the zeroth row, but +// this keeps the code slightly simpler. + + xorq i, i // Also initializes i for main loop + xorq j, j +bignum_montmul_zoop: + movq i, (z,j,8) + incq j + cmpq k, j + jc bignum_montmul_zoop + + xorq c0, c0 + +// Outer loop pulling down digits d=x[i], multiplying by y and reducing + +bignum_montmul_outerloop: + +// Multiply-add loop where we always have CF + previous high part h to add in. +// Note that in general we do need yet one more carry in this phase and hence +// initialize c1 with the top carry. 
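As a word-level C model of what one outer iteration accomplishes (hypothetical helper montmul_step; assumes unsigned __int128 and k >= 1, and ignores the register and carry-flag tricks below): add d*y into the running (carry::z) value, then fold in u*m with u = w*z[0] so the low word vanishes, and shift everything down one word. The same skeleton reappears in bignum_montsqr and bignum_montredc; after k such steps the caller still needs the final compare-and-subtract correction.

#include <stdint.h>

static void montmul_step(uint64_t k, uint64_t *z, uint64_t *carry,
                         uint64_t d, const uint64_t *y,
                         const uint64_t *m, uint64_t w)
{
  unsigned __int128 t;
  uint64_t c = 0;
  for (uint64_t i = 0; i < k; i++) {           /* z += d * y */
    t = (unsigned __int128)d * y[i] + z[i] + c;
    z[i] = (uint64_t)t;
    c = (uint64_t)(t >> 64);
  }
  *carry += c;                                 /* top word, may carry out */
  uint64_t c1 = (*carry < c);                  /* the extra carry bit */

  uint64_t u = w * z[0];                       /* Montgomery multiplier */
  t = (unsigned __int128)u * m[0] + z[0];      /* low word becomes zero */
  c = (uint64_t)(t >> 64);
  for (uint64_t i = 1; i < k; i++) {           /* z := (z + u*m) >> 64 */
    t = (unsigned __int128)u * m[i] + z[i] + c;
    z[i - 1] = (uint64_t)t;
    c = (uint64_t)(t >> 64);
  }
  t = (unsigned __int128)c + *carry;
  z[k - 1] = (uint64_t)t;
  *carry = (uint64_t)(t >> 64) + c1;
}

Calling it as for (i = 0; i < k; i++) montmul_step(k, z, &c0, x[i], y, m, w); mirrors the outer loop here.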
+ + movq (x,i,8), d + xorq j, j + xorq h, h + xorq c1, c1 + movq k, n + +bignum_montmul_maddloop: + adcq (z,j,8), h + sbbq e, e + movq (y,j,8), a + mulq d + subq e, %rdx + addq h, a + movq a, (z,j,8) + movq %rdx, h + incq j + decq n + jnz bignum_montmul_maddloop + adcq h, c0 + adcq c1, c1 + +// Montgomery reduction loop, similar but offsetting writebacks + + movq (z), e + movq w, d + imulq e, d + movq (m), a + mulq d + addq e, a // Will be zero but want the carry + movq %rdx, h + movl $1, jshort + movq k, n + decq n + jz bignum_montmul_montend + +bignum_montmul_montloop: + adcq (z,j,8), h + sbbq e, e + movq (m,j,8), a + mulq d + subq e, %rdx + addq h, a + movq a, -8(z,j,8) + movq %rdx, h + incq j + decq n + jnz bignum_montmul_montloop + +bignum_montmul_montend: + adcq c0, h + adcq $0, c1 + movq c1, c0 + movq h, -8(z,j,8) + +// End of outer loop. + + incq i + cmpq k, i + jc bignum_montmul_outerloop + +// Now do a comparison of (c0::z) with (0::m) to set a final correction mask +// indicating that (c0::z) >= m and so we need to subtract m. + + xorq j, j + movq k, n +bignum_montmul_cmploop: + movq (z,j,8), a + sbbq (m,j,8), a + incq j + decq n + jnz bignum_montmul_cmploop + + sbbq $0, c0 + sbbq d, d + notq d + +// Now do a masked subtraction of m for the final reduced result. + + xorq e, e + xorq j, j +bignum_montmul_corrloop: + movq (m,j,8), a + andq d, a + negq e + sbbq a, (z,j,8) + sbbq e, e + incq j + cmpq k, j + jc bignum_montmul_corrloop + +bignum_montmul_end: + addq $8, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_montredc.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_montredc.S new file mode 100644 index 00000000000..c023023f89b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_montredc.S @@ -0,0 +1,264 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery reduce, z := (x' / 2^{64p}) MOD m +// Inputs x[n], m[k], p; output z[k] +// +// extern void bignum_montredc +// (uint64_t k, uint64_t *z, +// uint64_t n, uint64_t *x, uint64_t *m, uint64_t p); +// +// Does a := (x' / 2^{64p}) mod m where x' = x if n <= p + k and in general +// is the lowest (p+k) digits of x, assuming x' <= 2^{64p} * m. That is, +// p-fold Montgomery reduction w.r.t. a k-digit modulus m giving a k-digit +// answer. 
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x, R8 = m, R9 = p +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = n, R9 = x, [RSP+40] = m, [RSP+48] = p +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montredc) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montredc) + .text + + // We copy n into %r10 but it comes in in %rdx originally + +#define k %rdi +#define z %rsi +#define n %r10 +#define x %rcx +#define m %r8 +#define p %r9 + +// General temp, low part of product and mul input +#define a %rax +// General temp, High part of product +#define b %rdx +// Negated modular inverse +#define w (%rsp) +// Inner loop counter +#define j %rbx +// Home for i'th digit or Montgomery multiplier +#define d %rbp +#define h %r11 +#define e %r12 +#define t %r13 +#define i %r14 +#define c %r15 + +// Some more intuitive names for temp regs in initial word-level negmodinv. + +#define t1 %rbx +#define t2 %r14 + +#define ashort %eax +#define cshort %r15d +#define jshort %ebx + + +S2N_BN_SYMBOL(bignum_montredc): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 + movq 64(%rsp), %r9 +#endif + +// Save registers and allocate space on stack for non-register variable w + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $8, %rsp + +// If k = 0 the whole operation is trivial + + testq k, k + jz bignum_montredc_end + +// Move n input into its permanent home, since we need %rdx for multiplications + + movq %rdx, n + +// Compute word-level negated modular inverse w for m == m[0]. + + movq (m), a + + movq a, t2 + movq a, t1 + shlq $2, t2 + subq t2, t1 + xorq $2, t1 + + movq t1, t2 + imulq a, t2 + movl $2, ashort + addq t2, a + addq $1, t2 + + imulq a, t1 + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, t1 + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, t1 + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, t1 + + movq t1, w + +// Initialize z to the lowest k digits of the input, zero-padding if n < k. 
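In C terms this initialization is simply the following (hypothetical helper, shown for clarity; the digit counts n and k are public, so library calls are fine here):

#include <stdint.h>
#include <string.h>

static void init_low_digits(uint64_t k, uint64_t *z, uint64_t n,
                            const uint64_t *x)
{
  uint64_t j = (n < k) ? n : k;                 /* copy min(n, k) digits */
  memcpy(z, x, j * sizeof(uint64_t));
  memset(z + j, 0, (k - j) * sizeof(uint64_t)); /* zero-pad the rest */
}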
+ + movq k, j + cmpq k, n + cmovcq n, j + xorq i, i + testq j, j + jz bignum_montredc_padloop +bignum_montredc_copyloop: + movq (x,i,8), a + movq a, (z,i,8) + incq i + cmpq j, i + jc bignum_montredc_copyloop + + cmpq k, i + jnc bignum_montredc_initialized + + xorq j, j +bignum_montredc_padloop: + movq j, (z,i,8) + incq i + cmpq k, i + jc bignum_montredc_padloop + +bignum_montredc_initialized: + xorq c, c + +// Now if p = 0 we just need the corrective tail, and even that is +// only needed for the case when the input is exactly the modulus, +// to maintain the <= 2^64p * n precondition + + testq p, p + jz bignum_montredc_corrective + +// Outer loop, just doing a standard Montgomery reduction on z + + xorq i, i +bignum_montredc_outerloop: + movq (z), e + movq w, d + imulq e, d + movq (m), a + mulq d + addq e, a // Will be zero but want the carry + movq %rdx, h + movl $1, jshort + movq k, t + decq t + jz bignum_montredc_montend + +bignum_montredc_montloop: + adcq (z,j,8), h + sbbq e, e + movq (m,j,8), a + mulq d + subq e, %rdx + addq h, a + movq a, -8(z,j,8) + movq %rdx, h + incq j + decq t + jnz bignum_montredc_montloop + +bignum_montredc_montend: + adcq c, h + movl $0, cshort + adcq $0, c + + addq i, j + cmpq n, j + jnc bignum_montredc_offtheend + movq (x,j,8), a + addq a, h + adcq $0, c +bignum_montredc_offtheend: + movq h, -8(z,k,8) + +// End of outer loop. + + incq i + cmpq p, i + jc bignum_montredc_outerloop + +// Now do a comparison of (c::z) with (0::m) to set a final correction mask +// indicating that (c::z) >= m and so we need to subtract m. + +bignum_montredc_corrective: + + xorq j, j + movq k, n +bignum_montredc_cmploop: + movq (z,j,8), a + sbbq (m,j,8), a + incq j + decq n + jnz bignum_montredc_cmploop + + sbbq $0, c + sbbq d, d + notq d + +// Now do a masked subtraction of m for the final reduced result. + + xorq e, e + xorq j, j +bignum_montredc_corrloop: + movq (m,j,8), a + andq d, a + negq e + sbbq a, (z,j,8) + sbbq e, e + incq j + cmpq k, j + jc bignum_montredc_corrloop + +bignum_montredc_end: + addq $8, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_montsqr.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_montsqr.S new file mode 100644 index 00000000000..f028239dd3d --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_montsqr.S @@ -0,0 +1,248 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^{64k}) mod m +// Inputs x[k], m[k]; output z[k] +// +// extern void bignum_montsqr +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); +// +// Does z := (x^2 / 2^{64k}) mod m, assuming x^2 <= 2^{64k} * m, which is +// guaranteed in particular if x < m initially (the "intended" case). 
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = m +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = m +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr) + .text + +// We copy x into %r9 but it comes in in %rdx originally + +#define k %rdi +#define z %rsi +#define x %r9 +#define m %rcx + +// General temp, low part of product and mul input +#define a %rax +// General temp, High part of product +#define b %rdx +// Negated modular inverse +#define w %r8 +// Inner loop counter +#define j %rbx +// Home for i'th digit or Montgomery multiplier +#define d %rbp +#define h %r10 +#define e %r11 +#define n %r12 +#define i %r13 +#define c0 %r14 +#define c1 %r15 + +// A temp reg in the initial word-level negmodinv. + +#define t2 %rdx + +#define ashort %eax +#define jshort %ebx + + +S2N_BN_SYMBOL(bignum_montsqr): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Save registers + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// If k = 0 the whole operation is trivial + + testq k, k + jz bignum_montsqr_end + +// Move x input into its permanent home, since we need %rdx for multiplications + + movq %rdx, x + +// Compute word-level negated modular inverse w for m == m[0]. + + movq (m), a + + movq a, t2 + movq a, w + shlq $2, t2 + subq t2, w + xorq $2, w + + movq w, t2 + imulq a, t2 + movl $2, ashort + addq t2, a + addq $1, t2 + + imulq a, w + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, w + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, w + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, w + +// Initialize the output c0::z to zero so we can then consistently add rows. +// It would be a bit more efficient to special-case the zeroth row, but +// this keeps the code slightly simpler. + + xorq i, i // Also initializes i for main loop + xorq j, j +bignum_montsqr_zoop: + movq i, (z,j,8) + incq j + cmpq k, j + jc bignum_montsqr_zoop + + xorq c0, c0 + +// Outer loop pulling down digits d=x[i], multiplying by x and reducing + +bignum_montsqr_outerloop: + +// Multiply-add loop where we always have CF + previous high part h to add in. +// Note that in general we do need yet one more carry in this phase and hence +// initialize c1 with the top carry. + + movq (x,i,8), d + xorq j, j + xorq h, h + xorq c1, c1 + movq k, n + +bignum_montsqr_maddloop: + adcq (z,j,8), h + sbbq e, e + movq (x,j,8), a + mulq d + subq e, %rdx + addq h, a + movq a, (z,j,8) + movq %rdx, h + incq j + decq n + jnz bignum_montsqr_maddloop + adcq h, c0 + adcq c1, c1 + +// Montgomery reduction loop, similar but offsetting writebacks + + movq (z), e + movq w, d + imulq e, d + movq (m), a + mulq d + addq e, a // Will be zero but want the carry + movq %rdx, h + movl $1, jshort + movq k, n + decq n + jz bignum_montsqr_montend + +bignum_montsqr_montloop: + adcq (z,j,8), h + sbbq e, e + movq (m,j,8), a + mulq d + subq e, %rdx + addq h, a + movq a, -8(z,j,8) + movq %rdx, h + incq j + decq n + jnz bignum_montsqr_montloop + +bignum_montsqr_montend: + adcq c0, h + adcq $0, c1 + movq c1, c0 + movq h, -8(z,j,8) + +// End of outer loop. + + incq i + cmpq k, i + jc bignum_montsqr_outerloop + +// Now do a comparison of (c0::z) with (0::m) to set a final correction mask +// indicating that (c0::z) >= m and so we need to subtract m. 
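A C model of this ending pattern, shared by the Montgomery routines in this batch (hypothetical helper; assumes unsigned __int128, and derives the mask with a comparison where the assembly uses sbb/not): a trial subtraction decides whether (carry::z) >= m, and a masked second pass then subtracts either m or 0 with the same memory traffic either way.

#include <stdint.h>

static void final_correction(uint64_t k, uint64_t *z, uint64_t carry,
                             const uint64_t *m)
{
  uint64_t borrow = 0;
  for (uint64_t i = 0; i < k; i++) {           /* trial compare z - m */
    unsigned __int128 s = (unsigned __int128)z[i] - m[i] - borrow;
    borrow = (uint64_t)(s >> 64) & 1;
  }
  uint64_t mask = (carry || !borrow) ? ~(uint64_t)0 : 0; /* (carry::z) >= m */
  borrow = 0;
  for (uint64_t i = 0; i < k; i++) {           /* masked subtraction of m */
    unsigned __int128 s = (unsigned __int128)z[i] - (m[i] & mask) - borrow;
    z[i] = (uint64_t)s;
    borrow = (uint64_t)(s >> 64) & 1;
  }
}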
+ + xorq j, j + movq k, n +bignum_montsqr_cmploop: + movq (z,j,8), a + sbbq (m,j,8), a + incq j + decq n + jnz bignum_montsqr_cmploop + + sbbq $0, c0 + sbbq d, d + notq d + +// Now do a masked subtraction of m for the final reduced result. + + xorq e, e + xorq j, j +bignum_montsqr_corrloop: + movq (m,j,8), a + andq d, a + negq e + sbbq a, (z,j,8) + sbbq e, e + incq j + cmpq k, j + jc bignum_montsqr_corrloop + +bignum_montsqr_end: + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_mul.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_mul.S new file mode 100644 index 00000000000..060064a4c7d --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_mul.S @@ -0,0 +1,156 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[m], y[n]; output z[k] +// +// extern void bignum_mul +// (uint64_t k, uint64_t *z, +// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Does the "z := x * y" operation where x is m digits, y is n, result z is k. +// Truncates the result in general unless k >= m + n +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul) + .text + +// These are actually right + +#define p %rdi +#define z %rsi +#define n %r8 + +// These are not + +#define c %r15 +#define h %r14 +#define l %r13 +#define x %r12 +#define y %r11 +#define i %rbx +#define k %r10 +#define m %rbp + +// These are always local scratch since multiplier result is in these + +#define a %rax +#define d %rdx + + + +S2N_BN_SYMBOL(bignum_mul): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 + movq 64(%rsp), %r9 +#endif + +// We use too many registers, and also we need %rax:%rdx for multiplications + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rdx, m + +// If the result size is zero, do nothing +// Note that even if either or both inputs has size zero, we can't +// just give up because we at least need to zero the output array +// If we did a multiply-add variant, however, then we could + + testq p, p + jz bignum_mul_end + +// Set initial 2-part sum to zero (we zero c inside the body) + + xorq h, h + xorq l, l + +// Otherwise do outer loop k = 0 ... 
k = p - 1 + + xorq k, k + +bignum_mul_outerloop: + +// Zero our carry term first; we eventually want it and a zero is useful now +// Set a = max 0 (k + 1 - n), i = min (k + 1) m +// This defines the range a <= j < i for the inner summation +// Note that since k < p < 2^64 we can assume k + 1 doesn't overflow +// And since we want to increment it anyway, we might as well do it now + + xorq c, c // c = 0 + incq k // k = k + 1 + + movq k, a // a = k + 1 + subq n, a // a = k + 1 - n + cmovcq c, a // a = max 0 (k + 1 - n) + + movq m, i // i = m + cmpq m, k // CF <=> k + 1 < m + cmovcq k, i // i = min (k + 1) m + +// Turn i into a loop count, and skip things if it's <= 0 +// Otherwise set up initial pointers x -> x0[a] and y -> y0[k - a] +// and then launch into the main inner loop, postdecrementing i + + movq k, d + subq i, d + subq a, i + jbe bignum_mul_innerend + leaq (%rcx,a,8), x + leaq -8(%r9,d,8), y + +bignum_mul_innerloop: + movq (y,i,8), %rax + mulq (x) + addq $8, x + addq %rax, l + adcq %rdx, h + adcq $0, c + decq i + jnz bignum_mul_innerloop + +bignum_mul_innerend: + + movq l, (z) + movq h, l + movq c, h + addq $8, z + + cmpq p, k + jc bignum_mul_outerloop + +bignum_mul_end: + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_muladd10.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_muladd10.S new file mode 100644 index 00000000000..2215714cc31 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_muladd10.S @@ -0,0 +1,79 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply bignum by 10 and add word: z := 10 * z + d +// Inputs z[k], d; outputs function return (carry) and z[k] +// +// extern uint64_t bignum_muladd10 (uint64_t k, uint64_t *z, uint64_t d); +// +// Although typically the input d < 10, this is not actually required. 
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = d, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = d, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_muladd10) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_muladd10) + .text + +#define k %rdi +#define z %rsi +#define d %rcx + +#define a %rax +#define l %rax + +#define h %rdx +#define i %r8 +#define ten %r9 +#define tenshort %r9d + +S2N_BN_SYMBOL(bignum_muladd10): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Move carry input to permanent home, and if k = 0 skip the main loop + + movq %rdx, d + testq k, k + jz bignum_muladd10_end + +// Simple loop + + xorq i, i + movl $10, tenshort +bignum_muladd10_loop: + movq (z,i,8), a + mulq ten + addq d, l + movq l, (z,i,8) + adcq $0, h + movq h, d + incq i + cmpq k, i + jc bignum_muladd10_loop + +// Return the final carry + +bignum_muladd10_end: + movq d, %rax +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_mux.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_mux.S new file mode 100644 index 00000000000..5ec5435ee14 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_mux.S @@ -0,0 +1,68 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiplex/select z := x (if p nonzero) or z := y (if p zero) +// Inputs p, x[k], y[k]; output z[k] +// +// extern void bignum_mux +// (uint64_t p, uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y); +// +// It is assumed that all numbers x, y and z have the same size k digits. +// +// Standard x86-64 ABI: RDI = p, RSI = k, RDX = z, RCX = x, R8 = y +// Microsoft x64 ABI: RCX = p, RDX = k, R8 = z, R9 = x, [RSP+40] = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mux) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mux) + .text + +#define b %rdi +#define k %rsi +#define z %rdx +#define x %rcx +#define y %r8 +#define i %r9 +#define a %rax + + + +S2N_BN_SYMBOL(bignum_mux): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + testq k, k + jz bignum_mux_end // If length = 0 do nothing + + xorq i, i + negq b // CF <=> (b != 0) +bignum_mux_loop: + movq (x,i,8), a + movq (y,i,8), b + cmovncq b, a // CF ? a : b + movq a, (z,i,8) + incq i + decq k + jnz bignum_mux_loop +bignum_mux_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_mux16.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_mux16.S new file mode 100644 index 00000000000..c04a0f44eaa --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_mux16.S @@ -0,0 +1,93 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Select element from 16-element table, z := xs[k*i] +// Inputs xs[16*k], i; output z[k] +// +// extern void bignum_mux16 +// (uint64_t k, uint64_t *z, uint64_t *xs, uint64_t i); +// +// It is assumed that all numbers xs[16] and the target z have the same size k +// The pointer xs is to a contiguous array of size 16, elements size-k bignums +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = xs, RCX = i +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = xs, R9 = i +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mux16) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mux16) + .text + +#define k %rdi +#define z %rsi + +// These get moved from original registers + +#define x %rcx +#define i %rax + +// Other registers + +#define a %rdx +#define b %r8 +#define j %r9 +#define n %r10 + + + +S2N_BN_SYMBOL(bignum_mux16): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + + +// Copy size into decrementable counter, or skip everything if k = 0 + + testq k, k + jz bignum_mux16_end // If length = 0 do nothing + movq k, n + +// Multiply i by k so we can compare pointer offsets directly with it + + movq %rcx, %rax + movq %rdx, %rcx + mulq k + +// Main loop + +bignum_mux16_loop: + movq (x), a + movq k, j +.rep 15 + movq (x,j,8), b + cmpq i, j + cmoveq b, a + addq k, j +.endr + movq a, (z) + addq $8, z + addq $8, x + decq n + jnz bignum_mux16_loop + +bignum_mux16_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_negmodinv.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_negmodinv.S new file mode 100644 index 00000000000..203a7ba5fbd --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_negmodinv.S @@ -0,0 +1,186 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Negated modular inverse, z := (-1/x) mod 2^{64k} +// Input x[k]; output z[k] +// +// extern void bignum_negmodinv +// (uint64_t k, uint64_t *z, uint64_t *x); +// +// Assuming x is odd (otherwise nothing makes sense) the result satisfies +// +// x * z + 1 == 0 (mod 2^{64 * k}) +// +// but is not necessarily reduced mod x. 
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_negmodinv) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_negmodinv) + .text + +#define k %rdi +#define z %rsi +// Moved from initial location to free %rdx +#define x %rcx + +#define a %rax +#define d %rdx +#define i %r8 +#define m %r9 +#define h %r10 +#define w %r11 +#define t %r12 +#define e %rbx + +#define ashort %eax +#define ishort %r8d + +S2N_BN_SYMBOL(bignum_negmodinv): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + + pushq %rbx + pushq %r12 + +// If k = 0 do nothing (actually we could have avoiding the pushes and pops) + + testq k, k + jz bignum_negmodinv_end + +// Move the x pointer into its permanent home (%rdx is needed for muls) + + movq %rdx, x + +// Compute word-level negated modular inverse w for x[0]. + + movq (x), a + + movq a, d + movq a, w + shlq $2, d + subq d, w + xorq $2, w + + movq w, d + imulq a, d + movl $2, ashort + addq d, a + addq $1, d + + imulq a, w + + imulq d, d + movl $1, ashort + addq d, a + imulq a, w + + imulq d, d + movl $1, ashort + addq d, a + imulq a, w + + imulq d, d + movl $1, ashort + addq d, a + imulq a, w + +// Write that as lowest word of the output, then if k = 1 we're finished + + movq w, (z) + cmpq $1, k + jz bignum_negmodinv_end + +// Otherwise compute and write the other digits (1..k-1) of w * x + 1 + + movq (x), a + xorq h, h + mulq w + addq $1, a + adcq d, h + movl $1, ishort +bignum_negmodinv_initloop: + movq (x,i,8), a + mulq w + addq h, a + adcq $0, d + movq a, (z,i,8) + movq d, h + incq i + cmpq k, i + jc bignum_negmodinv_initloop + +// For simpler indexing, z := z + 8 and k := k - 1 per outer iteration +// Then we can use the same index for x and for z and effective size k. +// +// But we also offset k by 1 so the "real" size is k + 1; after doing +// the special zeroth bit we count with t through k more digits, so +// getting k + 1 total as required. +// +// This lets us avoid some special cases inside the loop at the cost +// of needing the additional "finale" tail for the final iteration +// since we do one outer loop iteration too few. + + subq $2, k + jz bignum_negmodinv_finale + +bignum_negmodinv_outerloop: + addq $8, z + + movq (z), h + movq w, m + imulq h, m + movq m, (z) + movq (x), a + mulq m + addq h, a + adcq $0, d + movq d, h + movl $1, ishort + movq k, t + bignum_negmodinv_innerloop: + adcq (z,i,8), h + sbbq e, e + movq (x,i,8), a + mulq m + subq e, d + addq h, a + movq a, (z,i,8) + movq d, h + incq i + decq t + jnz bignum_negmodinv_innerloop + + decq k + jnz bignum_negmodinv_outerloop + +bignum_negmodinv_finale: + movq 8(z), a + imulq w, a + movq a, 8(z) + +bignum_negmodinv_end: + popq %r12 + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_nonzero.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_nonzero.S new file mode 100644 index 00000000000..2717367f5c9 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_nonzero.S @@ -0,0 +1,58 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Test bignum for nonzero-ness x =/= 0 +// Input x[k]; output function return +// +// extern uint64_t bignum_nonzero (uint64_t k, uint64_t *x); +// +// Standard x86-64 ABI: RDI = k, RSI = x, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = x, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_nonzero) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_nonzero) + .text + +#define a %rax +#define k %rdi +#define x %rsi + +S2N_BN_SYMBOL(bignum_nonzero): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + xorq a, a + testq k, k + jz bignum_nonzero_end + +bignum_nonzero_loop: + orq -8(x,k,8), a + decq k + jnz bignum_nonzero_loop + +// Set a standard C condition based on whether a is nonzero + + negq a + sbbq a, a + negq a + +bignum_nonzero_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_normalize.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_normalize.S new file mode 100644 index 00000000000..1056ed8305e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_normalize.S @@ -0,0 +1,124 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Normalize bignum in-place by shifting left till top bit is 1 +// Input z[k]; outputs function return (bits shifted left) and z[k] +// +// extern uint64_t bignum_normalize (uint64_t k, uint64_t *z); +// +// Given a k-digit bignum z, this function shifts it left by its number of +// leading zero bits, to give result with top bit 1, unless the input number +// was 0. The return is the same as the output of bignum_clz, i.e. the number +// of bits shifted (nominally 64 * k in the case of zero input). +// +// Standard x86-64 ABI: RDI = k, RSI = z, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_normalize) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_normalize) + .text + +#define k %rdi +#define z %rsi + +// Return value, which we put in %rax to save a move or two + +#define r %rax + +// Other variables +// Matters that c is RCX as CL=lo(c) is assumed in shifts + +#define b %r9 +#define c %rcx +#define d %rdx +#define i %r8 +#define j %r10 + +#define dshort %edx + + +S2N_BN_SYMBOL(bignum_normalize): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Initialize shift count r = 0 and i = k - 1 but return immediately if k = 0. +// Otherwise load top digit c, but then if k = 1 skip the digitwise part + + xorq r, r + movq k, i + subq $1, i + jc bignum_normalize_end + movq (z,i,8), c + jz bignum_normalize_bitpart + +// Do d rather stupid but constant-time digit normalization, conditionally +// shifting left (k-1) times based on whether the top word is zero. 
+// With careful binary striding this could be O(k*log(k)) instead of O(k^2) +// while still retaining the constant-time style. + +bignum_normalize_normloop: + xorq j, j + movq k, b + movq r, d + incq r + negq c + cmovneq d, r + movl $0, dshort +bignum_normalize_shufloop: + movq d, c + movq (z,j,8), d + cmovcq d, c + movq c, (z,j,8) + incq j + decq b + jnz bignum_normalize_shufloop + decq i + jnz bignum_normalize_normloop + +// We now have the top digit nonzero, assuming the input was nonzero, +// and as per the invariant of the loop above, c holds that digit. So +// now just count c's leading zeros and shift z bitwise that many bits. +// We need to patch the bsr result for the undefined case of zero input + +bignum_normalize_bitpart: + movl $127, dshort + bsrq c, c + cmovzq d, c + xorq $63, c + + shlq $6, r + addq c, r + + xorq b, b + xorq i, i +bignum_normalize_bitloop: + movq (z,i,8), d + movq d, j + shldq %cl, b, d + movq d, (z,i,8) + movq j, b + incq i + cmpq k, i + jc bignum_normalize_bitloop + + bignum_normalize_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_odd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_odd.S new file mode 100644 index 00000000000..81caa3a8bde --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_odd.S @@ -0,0 +1,52 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Test bignum for odd-ness +// Input x[k]; output function return +// +// extern uint64_t bignum_odd (uint64_t k, uint64_t *x); +// +// Standard x86-64 ABI: RDI = k, RSI = x, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = x, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_odd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_odd) + .text + +S2N_BN_SYMBOL(bignum_odd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Set default return value of 0 and finish if k = 0 (trivially not odd) + + xorl %eax, %eax + testq %rdi, %rdi + jz bignum_odd_end + +// Otherwise return lowest bit of the input + + movl $1, %eax + andq (%rsi), %rax + +bignum_odd_end: + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_of_word.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_of_word.S new file mode 100644 index 00000000000..cb7c794979b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_of_word.S @@ -0,0 +1,69 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert single digit to bignum, z := n +// Input n; output z[k] +// +// extern void bignum_of_word (uint64_t k, uint64_t *z, uint64_t n); +// +// Create a k-digit (digit=64 bits) bignum at z with value n (mod 2^k) +// where n is a word. The "mod 2^k" only matters in the degenerate k = 0 case. 
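Reviewer aid, not part of the patch: the bignum_of_word behaviour described above amounts to the following C sketch (the _ref name is illustrative only); the one subtlety is the degenerate k = 0 case, where nothing is written.

#include <stdint.h>

// Hedged reference for bignum_of_word: z becomes the k-digit value n mod 2^(64*k).
static void bignum_of_word_ref(uint64_t k, uint64_t *z, uint64_t n)
{
  if (k == 0)
    return;                    // "mod 2^0" case: no digits to write
  z[0] = n;                    // the low digit carries the whole value
  for (uint64_t i = 1; i < k; i++)
    z[i] = 0;                  // all higher digits are zero
}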
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = n +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_of_word) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_of_word) + .text + +S2N_BN_SYMBOL(bignum_of_word): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// If k = 0 do nothing + + testq %rdi, %rdi + jz bignum_of_word_end + +bignum_of_word_nontrivial: + +// Write lowest word and jump to end if k = 1 + + movq %rdx, (%rsi) + decq %rdi + jz bignum_of_word_end + +// Zero %rdx and write it to all z[i] for i = k-1 down to 1 +// It's a bit more compact to iterate "high to low" like this. +// But at the cost of bumping up %rsi by lea %rsi, [%rsi+8] +// each time round the loop (which also modifies one more reg) +// we could go "low to high" if it helps with prefetch etc. + + xorq %rdx, %rdx +bignum_of_word_loop: + movq %rdx, (%rsi,%rdi,8) + decq %rdi + jnz bignum_of_word_loop + +bignum_of_word_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_optadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_optadd.S new file mode 100644 index 00000000000..90aa07e0a44 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_optadd.S @@ -0,0 +1,92 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally add, z := x + y (if p nonzero) or z := x (if p zero) +// Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_optadd +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y); +// +// It is assumed that all numbers x, y and z have the same size k digits. +// Returns carry-out as per usual addition, always 0 if p was zero. 
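Reviewer aid, not part of the patch: the optional-add pattern of bignum_optadd (implementation follows below) can be restated in C with the neg/sbb mask trick spelled out; the _ref name and the carry bookkeeping are illustrative, not the code being imported.

#include <stdint.h>

// p is converted to an all-ones/all-zeros mask so the same carry loop runs
// whether or not y is actually added; the return is the top carry, 0 if p == 0.
static uint64_t bignum_optadd_ref(uint64_t k, uint64_t *z, const uint64_t *x,
                                  uint64_t p, const uint64_t *y)
{
  uint64_t mask = (p != 0) ? ~(uint64_t)0 : 0;   // C version of "neg p; sbb p, p"
  uint64_t carry = 0;
  for (uint64_t i = 0; i < k; i++) {
    uint64_t b = y[i] & mask;        // 0 when p == 0, y[i] otherwise
    uint64_t s = x[i] + b;
    uint64_t c1 = (s < x[i]);        // carry out of x[i] + b
    uint64_t t = s + carry;
    uint64_t c2 = (t < s);           // carry out of adding the running carry
    z[i] = t;
    carry = c1 | c2;
  }
  return carry;
}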
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = p, R8 = y, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = p, [RSP+40] = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optadd) + .text + +#define k %rdi +#define z %rsi +#define x %rdx +#define p %rcx +#define y %r8 + +#define c %rax +#define i %r9 +#define b %r10 +#define a %r11 + + +S2N_BN_SYMBOL(bignum_optadd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// Initialize top carry to zero in all cases (also return value) + + xorq c, c + +// If k = 0 do nothing + + testq k, k + jz bignum_optadd_end + +// Convert the nonzero/zero status of p into an all-1s or all-0s mask + + negq p + sbbq p, p + +// Now go round the loop for i=0...k-1, saving the carry in c each iteration + + xorq i, i +bignum_optadd_loop: + movq (x,i,8), a + movq (y,i,8), b + andq p, b + negq c + adcq b, a + sbbq c, c + movq a, (z,i,8) + incq i + cmpq k, i + jc bignum_optadd_loop + +// Return top carry + + negq %rax + +bignum_optadd_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_optneg.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_optneg.S new file mode 100644 index 00000000000..288c887f028 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_optneg.S @@ -0,0 +1,89 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally negate, z := -x (if p nonzero) or z := x (if p zero) +// Inputs p, x[k]; outputs function return (nonzero input) and z[k] +// +// extern uint64_t bignum_optneg +// (uint64_t k, uint64_t *z, uint64_t p, uint64_t *x); +// +// It is assumed that both numbers x and z have the same size k digits. +// Returns a carry, which is equivalent to "x is nonzero". 
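Reviewer aid, not part of the patch: the optional negation implemented below is branchless two's-complement negation under a mask; a C sketch follows (names illustrative). The final fix-up makes the return "x is nonzero" in the negating case and 0 when p is zero.

#include <stdint.h>

static uint64_t bignum_optneg_ref(uint64_t k, uint64_t *z, uint64_t p,
                                  const uint64_t *x)
{
  uint64_t mask = (p != 0) ? ~(uint64_t)0 : 0;   // complement digits only if p != 0
  uint64_t carry = (p != 0);                     // the +1 of two's complement
  for (uint64_t i = 0; i < k; i++) {
    uint64_t a = (x[i] ^ mask) + carry;
    carry = (a < carry);                         // carry out of this digit
    z[i] = a;
  }
  // When negating, the carry out is 1 exactly for x == 0, so it is inverted;
  // when p == 0 the mask is 0 and the whole expression stays 0.
  return (carry ^ mask) & 1;
}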
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = p, RCX = x, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = p, R9 = x, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optneg) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optneg) + .text + +#define k %rdi +#define z %rsi +#define p %rdx +#define x %rcx + +#define c %rax +#define a %r8 +#define i %r9 + +#define cshort %eax + +S2N_BN_SYMBOL(bignum_optneg): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// If k = 0 do nothing, but need to set zero return for the carry (c = %rax) + + xorq c, c + testq k, k + jz bignum_optneg_end + +// Convert p into a strict bitmask and set initial carry-in in c + + negq p + sbbq p, p + subq p, c + +// Main loop + + xorq i, i +bignum_optneg_loop: + + movq (x,i,8), a + xorq p, a + addq c, a + movl $0, cshort + movq a, (z,i,8) + adcq $0, c + incq i + cmpq k, i + jc bignum_optneg_loop + +// Return carry flag, fixing up inversion for negative case + + xorq p, %rax + andq $1, %rax + +bignum_optneg_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_optsub.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_optsub.S new file mode 100644 index 00000000000..29a716b7bb0 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_optsub.S @@ -0,0 +1,92 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally subtract, z := x - y (if p nonzero) or z := x (if p zero) +// Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_optsub +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y); +// +// It is assumed that all numbers x, y and z have the same size k digits. +// Returns carry-out as per usual subtraction, always 0 if p was zero. 
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = p, R8 = y, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = p, [RSP+40] = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optsub) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optsub) + .text + +#define k %rdi +#define z %rsi +#define x %rdx +#define p %rcx +#define y %r8 + +#define i %r9 +#define b %r10 +#define c %rax +#define a %r11 + + +S2N_BN_SYMBOL(bignum_optsub): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// Initialize top carry to zero in all cases (also return value) + + xorq c, c + +// If k = 0 do nothing + + testq k, k + jz bignum_optsub_end + +// Convert the nonzero/zero status of p into an all-1s or all-0s mask + + negq p + sbbq p, p + +// Now go round the loop for i=0...k-1, saving the carry in c each iteration + + xorq i, i +bignum_optsub_loop: + movq (x,i,8), a + movq (y,i,8), b + andq p, b + negq c + sbbq b, a + sbbq c, c + movq a, (z,i,8) + incq i + cmpq k, i + jc bignum_optsub_loop + +// Return top carry + + negq %rax + +bignum_optsub_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_optsubadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_optsubadd.S new file mode 100644 index 00000000000..051886d3070 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_optsubadd.S @@ -0,0 +1,109 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally subtract or add, z := x + sgn(p) * y interpreting p as signed +// Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_optsubadd +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y); +// +// If p has top bit set (i.e. is negative as a signed int) return z := x - y +// Else if p is nonzero (i.e. is positive as a signed int) return z := x + y +// Otherwise (i.e. 
p is zero) return z := x +// +// Return in RDI = the top carry, which will be 0 or 1, and appropriate for +// addition or subtraction respectively (and always zero for p = 0) +// +// 2^{64*k} * -carryout + z = x - y [for subtraction] +// 2^{64*k} * carryout + z = x + y [for addition] +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = p, R8 = y, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = p, [RSP+40] = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optsubadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optsubadd) + .text + +#define k %rdi +#define z %rsi +#define x %rdx +#define p %rcx +#define y %r8 + +#define c %rax +#define i %r9 +#define m %rcx +#define q %r10 +#define a %r11 + + +S2N_BN_SYMBOL(bignum_optsubadd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// Initialize top carry to zero in all cases (also return value) + + xorq c, c + +// If k = 0 do nothing + + testq k, k + jz bignum_optsubadd_end + +// Turn the input p into two bitmasks, m indicating to use the y input at +// all (same register as p) and q indicating a sign-flip + + movq p, q + sarq $63, q + negq p + sbbq m, m + +// Generate an initial carry-in for the negating case only to add 1; this +// is because we are actually going to do complements of the words of y + + movq q, c + +// Now go round the loop for i=0...k-1, saving the carry in c each iteration + + xorq i, i +bignum_optsubadd_loop: + movq (y,i,8), a + xorq q, a + andq m, a + negq c + adcq (x,i,8), a + sbbq c, c + movq a, (z,i,8) + incq i + cmpq k, i + jc bignum_optsubadd_loop + +// Return carry flag, fixing up inversion for negative case + + xorq q, %rax + negq %rax + +bignum_optsubadd_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_pow2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_pow2.S new file mode 100644 index 00000000000..0e0b0206b95 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_pow2.S @@ -0,0 +1,81 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Return bignum of power of 2, z := 2^n +// Input n; output z[k] +// +// extern void bignum_pow2 (uint64_t k, uint64_t *z, uint64_t n); +// +// The result is as usual mod 2^{64*k}, so will be zero if n >= 64*k. 
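Reviewer aid, not part of the patch: a C sketch of the constant-time selection used by bignum_pow2 (the code follows below). Unlike the hardware shift, C needs the explicit "& 63" mask, since shifting a 64-bit value by 64 or more is undefined; the ternary stands in for the cmovz.

#include <stdint.h>

static void bignum_pow2_ref(uint64_t k, uint64_t *z, uint64_t n)
{
  uint64_t w   = (uint64_t)1 << (n & 63);   // 2^(n mod 64)
  uint64_t idx = n >> 6;                    // digit that receives the bit
  for (uint64_t i = 0; i < k; i++)
    z[i] = (i == idx) ? w : 0;              // every digit written; all zero if n >= 64*k
}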
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = n +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_pow2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_pow2) + .text + +#define k %rdi +#define z %rsi +#define n %rdx + +#define i %rcx +#define w %rax +#define a %r8 + +#define wshort %eax + + + +S2N_BN_SYMBOL(bignum_pow2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// If k = 0 do nothing + + testq k, k + jz bignum_pow2_end + +// Create the index n at which to write the nonzero word and the word w itself +// Note that the x86 manual explicitly says that shift counts are taken modulo +// the datasize, so we don't need to mask the lower 6 bits of n ourselves. + + movl $1, wshort + movq n, %rcx + shlq %cl, w + shrq $6, n + +// Now in a constant-time fashion set the n'th word to w and others to zero + + xorq i, i +bignum_pow2_loop: + xorq a, a + cmpq n, i + cmovzq w, a + movq a, (z,i,8) + incq i + cmpq k, i + jc bignum_pow2_loop + +bignum_pow2_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_shl_small.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_shl_small.S new file mode 100644 index 00000000000..f2170a56d03 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_shl_small.S @@ -0,0 +1,124 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Shift bignum left by c < 64 bits z := x * 2^c +// Inputs x[n], c; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_shl_small +// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t c); +// +// Does the "z := x << c" operation where x is n digits, result z is p. +// The shift count c is masked to 6 bits so it actually uses c' = c mod 64. +// The return value is the "next word" of a p+1 bit result, if n <= p. +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x, R8 = c, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = n, R9 = x, [RSP+40] = c, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_shl_small) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_shl_small) + .text + +#define p %rdi +#define z %rsi +#define n %rdx + +// These get moved from their initial positions + +#define c %rcx +#define x %r9 + +// Other variables + +#define b %rax +#define t %r8 +#define a %r10 +#define i %r11 + + + +S2N_BN_SYMBOL(bignum_shl_small): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// First clamp the input size n := min(p,n) since we can never need to read +// past the p'th term of the input to generate p-digit output. + + cmpq n, p + cmovcq p, n + +// Initialize "previous word" carry b to zero and main index i also to zero. 
+// Then just skip the main loop if n = 0 + + xorq b, b + xorq i, i + + testq n, n + jz bignum_shl_small_tail + +// Reshuffle registers to put the shift count into CL + + movq %rcx, x + movq %r8, c + +// Now the main loop + +bignum_shl_small_loop: + movq (x,i,8), a + movq a, t + shldq %cl, b, a + movq a, (z,i,8) + movq t, b + incq i + cmpq n, i + jc bignum_shl_small_loop + +// Shift the top word correspondingly. Using shld one more time is easier +// than carefully producing a complementary shift with care over the zero case + + xorq a, a + shldq %cl, b, a + movq a, b + +// If we are at the end, finish, otherwise write carry word then zeros + +bignum_shl_small_tail: + cmpq p, i + jnc bignum_shl_small_end + movq b, (z,i,8) + xorq b, b + incq i + cmpq p, i + jnc bignum_shl_small_end + +bignum_shl_small_tloop: + movq b, (z,i,8) + incq i + cmpq p, i + jc bignum_shl_small_tloop + +// Return, with RAX = b as the top word + +bignum_shl_small_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_shr_small.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_shr_small.S new file mode 100644 index 00000000000..8224c650c19 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_shr_small.S @@ -0,0 +1,114 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Shift bignum right by c < 64 bits z := floor(x / 2^c) +// Inputs x[n], c; outputs function return (bits shifted out) and z[k] +// +// extern uint64_t bignum_shr_small +// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t c); +// +// Does the "z := x >> c" operation where x is n digits, result z is p. +// The shift count c is masked to 6 bits so it actually uses c' = c mod 64. +// The return value is the inout mod 2^c'. +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x, R8 = c, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = n, R9 = x, [RSP+40] = c, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_shr_small) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_shr_small) + .text + +#define p %rdi +#define z %rsi +#define n %rdx + +// These get moved from their initial positions + +#define c %rcx +#define x %r9 + +// Other variables + +#define b %rax +#define t %r8 +#define a %r10 + +#define ashort %r10d + + + +S2N_BN_SYMBOL(bignum_shr_small): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// Reshuffle registers to put the shift count into CL + + movq %rcx, x + movq %r8, c + +// Set default carry-in word to 0, useful for other things too + + xorq b, b + +// First, if p > n then pad output on the left with p-n zeros + + cmpq p, n + jnc bignum_shr_small_nopad +bignum_shr_small_padloop: + decq p + movq b, (z,p,8) + cmpq p, n + jc bignum_shr_small_padloop +bignum_shr_small_nopad: + +// We now know that p <= n. 
If in fact p < n let carry word = x[p] instead of 0 + + jz bignum_shr_small_shiftstart + movq (x,p,8), b +bignum_shr_small_shiftstart: + testq p, p + jz bignum_shr_small_trivial + +// Now the main loop + +bignum_shr_small_loop: + movq -8(x,p,8), a + movq a, t + shrdq %cl, b, a + movq a, -8(z,p,8) + movq t, b + decq p + jnz bignum_shr_small_loop + +// Mask the carry word and return with that as RAX = b + +bignum_shr_small_trivial: + movl $1, ashort + shlq %cl, a + decq a + andq a, b + +bignum_shr_small_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_sqr.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_sqr.S new file mode 100644 index 00000000000..916f22a8a4b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_sqr.S @@ -0,0 +1,186 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square z := x^2 +// Input x[n]; output z[k] +// +// extern void bignum_sqr +// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x); +// +// Does the "z := x^2" operation where x is n digits and result z is k. +// Truncates the result in general unless k >= 2 * n +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = n, R9 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr) + .text + +// First three are where arguments come in, but n is moved. + +#define p %rdi +#define z %rsi +#define x %rcx +#define n %r8 + +// These are always local scratch since multiplier result is in these + +#define a %rax +#define d %rdx + +// Other variables + +#define i %rbx +#define ll %rbp +#define hh %r9 +#define k %r10 +#define y %r11 +#define htop %r12 +#define l %r13 +#define h %r14 +#define c %r15 + +// Short versions + +#define llshort %ebp + +S2N_BN_SYMBOL(bignum_sqr): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// We use too many registers, and also we need %rax:%rdx for multiplications + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rdx, n + +// If p = 0 the result is trivial and nothing needs doing + + testq p, p + jz bignum_sqr_end + +// initialize (hh,ll) = 0 + + xorl llshort, llshort + xorq hh, hh + +// Iterate outer loop from k = 0 ... k = p - 1 producing result digits + + xorq k, k + +bignum_sqr_outerloop: + +// First let bot = MAX 0 (k + 1 - n) and top = MIN (k + 1) n +// We want to accumulate all x[i] * x[k - i] for bot <= i < top +// For the optimization of squaring we avoid duplication and do +// 2 * x[i] * x[k - i] for i < htop, where htop = MIN ((k+1)/2) n +// Initialize i = bot; in fact just compute bot as i directly. 
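Reviewer aid, not part of the patch: the column bounds described in the comment above (bot, top, htop, the doubling of cross products and the extra square term for even k) are easier to check against a plain C reference. The helper name, the unsigned __int128 type (a GCC/Clang extension) and the 192-bit accumulator layout below are illustrative, not the code being imported.

#include <stdint.h>
typedef unsigned __int128 u128;   // GCC/Clang extension, assumed available

static void bignum_sqr_ref(uint64_t p, uint64_t *z, uint64_t n, const uint64_t *x)
{
  uint64_t ll = 0, hh = 0;                          // carry words from the previous column
  for (uint64_t k = 0; k < p; k++) {
    uint64_t bot  = (k + 1 > n) ? (k + 1 - n) : 0;  // lowest i with k - i < n
    uint64_t htop = (k + 1) / 2;                    // cross products stop at i < htop
    if (htop > n) htop = n;
    u128 lo = 0;  uint64_t hi = 0;                  // 192-bit column sum (hi:lo)
    for (uint64_t i = bot; i < htop; i++) {         // each cross product counted once...
      u128 prod = (u128)x[i] * x[k - i];
      u128 t = lo + prod;  hi += (t < lo);  lo = t;
    }
    hi = (hi << 1) | (uint64_t)(lo >> 127);         // ...then the whole sum doubled
    lo <<= 1;
    if ((k & 1) == 0 && k / 2 < n) {                // extra square term for even k
      u128 sq = (u128)x[k / 2] * x[k / 2];
      u128 t = lo + sq;  hi += (t < lo);  lo = t;
    }
    u128 prev = ((u128)hh << 64) | ll;              // add the carry-in from the last column
    u128 t = lo + prev;  hi += (t < lo);  lo = t;
    z[k] = (uint64_t)lo;                            // low word is this output digit
    ll = (uint64_t)(lo >> 64);                      // keep the top 128 bits for the next column
    hh = hi;
  }
}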
+ + xorq c, c + leaq 1(k), i + movq i, htop + shrq $1, htop + subq n, i + cmovcq c, i + cmpq n, htop + cmovncq n, htop + +// Initialize the three-part local sum (c,h,l); c was already done above + + xorq l, l + xorq h, h + +// If htop <= bot then main doubled part of the sum is empty + + cmpq htop, i + jnc bignum_sqr_nosumming + +// Use a moving pointer for [y] = x[k-i] for the cofactor + + movq k, a + subq i, a + leaq (x,a,8), y + +// Do the main part of the sum x[i] * x[k - i] for 2 * i < k + +bignum_sqr_innerloop: + movq (x,i,8), a + mulq (y) + addq a, l + adcq d, h + adcq $0, c + subq $8, y + incq i + cmpq htop, i + jc bignum_sqr_innerloop + +// Now double it + + addq l, l + adcq h, h + adcq c, c + +// If k is even (which means 2 * i = k) and i < n add the extra x[i]^2 term + +bignum_sqr_nosumming: + testq $1, k + jnz bignum_sqr_innerend + cmpq n, i + jnc bignum_sqr_innerend + + movq (x,i,8), a + mulq a + addq a, l + adcq d, h + adcq $0, c + +// Now add the local sum into the global sum, store and shift + +bignum_sqr_innerend: + addq ll, l + movq l, (z,k,8) + adcq hh, h + movq h, ll + adcq $0, c + movq c, hh + + incq k + cmpq p, k + jc bignum_sqr_outerloop + +// Restore registers and return + +bignum_sqr_end: + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_sub.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_sub.S new file mode 100644 index 00000000000..589b89500e2 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_sub.S @@ -0,0 +1,142 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Subtract, z := x - y +// Inputs x[m], y[n]; outputs function return (carry-out) and z[p] +// +// extern uint64_t bignum_sub +// (uint64_t p, uint64_t *z, +// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Does the z := x - y operation, truncating modulo p words in general and +// returning a top borrow (0 or 1) in the p'th place, only subtracting input +// words below p (as well as m and n respectively) to get the diff and borrow. +// +// Standard x86-64 ABI: RDI = p, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y, returns RAX +// Microsoft x64 ABI: RCX = p, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub) + .text + +#define p %rdi +#define z %rsi +#define m %rdx +#define x %rcx +#define n %r8 +#define y %r9 +#define i %r10 +#define a %rax + +#define ashort %eax + + + +S2N_BN_SYMBOL(bignum_sub): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 + movq 64(%rsp), %r9 +#endif + +// Zero the main index counter for both branches + + xorq i, i + +// First clamp the two input sizes m := min(p,m) and n := min(p,n) since +// we'll never need words past the p'th. Can now assume m <= p and n <= p. 
+// Then compare the modified m and n and branch accordingly + + cmpq m, p + cmovcq p, m + cmpq n, p + cmovcq p, n + cmpq n, m + jc bignum_sub_ylonger + +// The case where x is longer or of the same size (p >= m >= n) + + subq m, p + subq n, m + incq m + testq n, n + jz bignum_sub_xtest +bignum_sub_xmainloop: + movq (x,i,8), a + sbbq (y,i,8), a + movq a, (z,i,8) + incq i + decq n + jnz bignum_sub_xmainloop + jmp bignum_sub_xtest +bignum_sub_xtoploop: + movq (x,i,8), a + sbbq $0, a + movq a, (z,i,8) + incq i +bignum_sub_xtest: + decq m + jnz bignum_sub_xtoploop + sbbq a, a + testq p, p + jz bignum_sub_tailskip +bignum_sub_tailloop: + movq a, (z,i,8) + incq i + decq p + jnz bignum_sub_tailloop +bignum_sub_tailskip: + negq a +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +// The case where y is longer (p >= n > m) + +bignum_sub_ylonger: + + subq n, p + subq m, n + testq m, m + jz bignum_sub_ytoploop +bignum_sub_ymainloop: + movq (x,i,8), a + sbbq (y,i,8), a + movq a, (z,i,8) + incq i + decq m + jnz bignum_sub_ymainloop +bignum_sub_ytoploop: + movl $0, ashort + sbbq (y,i,8), a + movq a, (z,i,8) + incq i + decq n + jnz bignum_sub_ytoploop + sbbq a, a + testq p, p + jnz bignum_sub_tailloop + negq a +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_bytereverse.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_bytereverse.S new file mode 100644 index 00000000000..8f22655043a --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_bytereverse.S @@ -0,0 +1,41 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reverse the order of bytes in a 64-bit word +// +// extern uint64_t word_bytereverse (uint64_t a); +// +// Standard x86-64 ABI: RDI = a, returns RAX +// Microsoft x64 ABI: RCX = a, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_bytereverse) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_bytereverse) + .text + +// Just uses the x86 BSWAP instruction, which does the job directly + +S2N_BN_SYMBOL(word_bytereverse): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi +#endif + + movq %rdi, %rax + bswapq %rax +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_clz.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_clz.S new file mode 100644 index 00000000000..8b613fc4194 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_clz.S @@ -0,0 +1,49 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count leading zero bits in a single word +// Input a; output function return +// +// extern uint64_t word_clz (uint64_t a); +// +// Standard x86-64 ABI: RDI = a, returns RAX +// Microsoft x64 ABI: RCX = a, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_clz) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_clz) + .text + +S2N_BN_SYMBOL(word_clz): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi +#endif + +// First do %rax = 63 - bsr(a), which is right except (maybe) for zero inputs + + bsrq %rdi, %rax + xorq $63, %rax + +// Force return of 64 in the zero-input case + + movl $64, %edx + testq %rdi, %rdi + cmoveq %rdx, %rax + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_ctz.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_ctz.S new file mode 100644 index 00000000000..be1db1491fb --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_ctz.S @@ -0,0 +1,48 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count trailing zero bits in a single word +// Input a; output function return +// +// extern uint64_t word_ctz (uint64_t a); +// +// Standard x86-64 ABI: RDI = a, returns RAX +// Microsoft x64 ABI: RCX = a, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_ctz) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_ctz) + .text + +S2N_BN_SYMBOL(word_ctz): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi +#endif + +// First just do %rax = bsf(a), which is right except (maybe) for zero inputs + + bsfq %rdi, %rax + +// Force return of 64 in the zero-input case + + movl $64, %edx + testq %rdi, %rdi + cmoveq %rdx, %rax + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_divstep59.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_divstep59.S new file mode 100644 index 00000000000..139c83b7a5f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_divstep59.S @@ -0,0 +1,402 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
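Reviewer aid, not part of the patch: the BSR/BSF-plus-CMOV pattern in word_clz and word_ctz above has the same effect as the following C, assuming the GCC/Clang builtins, which (like BSR/BSF) leave the zero-input case undefined and therefore need the same explicit guard.

#include <stdint.h>

static uint64_t word_clz_ref(uint64_t a) { return a ? (uint64_t)__builtin_clzll(a) : 64; }
static uint64_t word_ctz_ref(uint64_t a) { return a ? (uint64_t)__builtin_ctzll(a) : 64; }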
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Perform 59 "divstep" iterations and return signed matrix of updates +// Inputs d, f, g; output m[2][2] and function return +// +// extern int64_t word_divstep59 +// (int64_t m[2][2],int64_t d,uint64_t f,uint64_t g); +// +// Standard x86-64 ABI: RDI = m, RSI = d, RDX = f, RCX = g, returns RAX +// Microsoft x64 ABI: RCX = m, RDX = d, R8 = f, R9 = g, returns RAX +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_divstep59) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_divstep59) + .text + +#define mat %rdi + +#define d %rsi +#define fuv %rbx +#define grs %rcx + +#define f %r12 +#define g %r13 + +#define m %r8 +#define t %r9 + +#define zero %rbp +#define zeroe %ebp +#define minus2 %rax +#define minus2e %eax +#define plus2 %rdx +#define plus2e %edx + +#define m00 %r8 +#define m01 %r9 +#define m10 %r10 +#define m11 %r11 + +S2N_BN_SYMBOL(word_divstep59): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Save extra registers + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + +// Pack f and g into single registers with (negated) update matrices, +// initially the identity matrix. The f_lo and g_lo are initially +// the 20 lowest bits of f and g. +// +// fuv = f_lo - 2^41 * 1 - 2^62 * 0 +// grs = g_lo - 2^41 * 0 - 2^62 * 1 + + movq %rdx, fuv + movq %rdx, f + andq $0xFFFFF, fuv + movq $0xFFFFFE0000000000, %rax + orq %rax, fuv + + movq %rcx, g + andq $0xFFFFF, grs + movq $0xc000000000000000, %rax + orq %rax, grs + +// Now do 20 divsteps on that packed format. +// +// At the i'th iteration (starting at i = 0, ending at i = 20) +// the intermediate packed values are of the form +// +// fuv = f_lo - 2^{41-i} * m00 - 2^{62-i} * m01 +// grs = g_lo - 2^{41-i} * m10 - 2^{62-i} * m11 +// +// where the following matrix indicates the updates to apply +// to the original (full-sized) f and g for those iterations. +// +// [m00 m01] * [f_0] = [f_i] +// [m10 m11] [g_0] [g_i] + + movq $-2, minus2 + xorl zeroe, zeroe + movl $2, plus2e + movq fuv, t + movq minus2, m + testq d, d + cmovs zero, m + testq $1, grs +.set i, 0 +.rep 20 + cmovzq zero, m + cmovzq zero, t +.if (i != 0) + sarq $1, grs +.endif + xorq m, t + xorq m, d + btq $63, m + cmovcq grs, fuv + movq minus2, m + addq plus2, d + leaq (grs,t), grs +.if (i != 19) + cmovs zero, m + movq fuv, t + testq plus2, grs +.endif +.set i, (i+1) +.endr + sarq $1, grs + +// Extract the matrix entries, but keep them in negated form. +// Store them in the output buffer temporarily. + + movl $1048576, %eax + leaq (fuv,%rax), m00 + leaq (grs,%rax), m10 + shlq $22, m00 + shlq $22, m10 + sarq $43, m00 + sarq $43, m10 + + movq $2199024304128, %rax + leaq (fuv,%rax), m01 + leaq (grs,%rax), m11 + sarq $42, m01 + sarq $42, m11 + + movq m00, (mat) + movq m01, 8(mat) + movq m10, 16(mat) + movq m11, 24(mat) + +// Compute updated f and g using the negated matrix entries; +// this flips the signs of f and g but it doesn't matter. +// +// f = (m00 * f + m01 * g) / 2^20 +// g = (m10 * f + m11 * g) / 2^20 +// +// Since we only need another 40 bits, we can do all of that +// computation naively using (implicitly signed) 64-bit words. 
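Reviewer aid, not part of the patch: the f/g update just described can be restated as below; the products are done in uint64_t so the wraparound is well defined, while the conversion back and the >> 20 assume the usual two's-complement, arithmetic-shift behaviour for int64_t. Only the low-order bits matter here, as the comment above notes. The helper name is illustrative.

#include <stdint.h>

// m00..m11 are the (negated) matrix entries extracted from the 20 divsteps.
static void divstep_update_fg(int64_t m00, int64_t m01, int64_t m10, int64_t m11,
                              int64_t *f, int64_t *g)
{
  uint64_t f0 = (uint64_t)*f, g0 = (uint64_t)*g;   // read both before writing either
  *f = (int64_t)((uint64_t)m00 * f0 + (uint64_t)m01 * g0) >> 20;
  *g = (int64_t)((uint64_t)m10 * f0 + (uint64_t)m11 * g0) >> 20;
}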
+ + imulq f, m10 + imulq m00, f + imulq g, m01 + imulq m11, g + addq m01, f + addq m10, g + sarq $20, f + sarq $20, g + +// Re-pack for 20 more rounds + + movq f, fuv + andq $0xFFFFF, fuv + movq $0xFFFFFE0000000000, %rax + orq %rax, fuv + + movq g, grs + andq $0xFFFFF, grs + movq $0xc000000000000000, %rax + orq %rax, grs + +// Second block of 20 divsteps in the same style + + movq $-2, minus2 + movl $2, plus2e + movq fuv, t + movq minus2, m + testq d, d + cmovs zero, m + testq $1, grs +.set i, 0 +.rep 20 + cmovzq zero, m + cmovzq zero, t +.if (i != 0) + sarq $1, grs +.endif + xorq m, t + xorq m, d + btq $63, m + cmovcq grs, fuv + movq minus2, m + addq plus2, d + leaq (grs,t), grs +.if (i != 19) + cmovs zero, m + movq fuv, t + testq plus2, grs +.endif +.set i, (i+1) +.endr + sarq $1, grs + +// Extract the next matrix entries, in negated form again + + movl $1048576, %eax + leaq (fuv,%rax), m00 + leaq (grs,%rax), m10 + shlq $22, m00 + shlq $22, m10 + sarq $43, m00 + sarq $43, m10 + + movq $2199024304128, %rax + leaq (fuv,%rax), m01 + leaq (grs,%rax), m11 + sarq $42, m01 + sarq $42, m11 + +// Compute updated f and g using the negated matrix entries, +// and so again flipping (thus actually restoring) the signs. +// +// f = (n00 * f + n01 * g) / 2^20 +// g = (n10 * f + n11 * g) / 2^20 + + movq g, fuv + movq f, grs + imulq m00, f + imulq m01, fuv + addq fuv, f + imulq m11, g + imulq m10, grs + addq grs, g + sarq $20, f + sarq $20, g + +// Re-pack for 20 more rounds + + movq f, fuv + andq $0xFFFFF, fuv + movq $0xFFFFFE0000000000, %rax + orq %rax, fuv + + movq g, grs + andq $0xFFFFF, grs + movq $0xc000000000000000, %rax + orq %rax, grs + +// Multiply the first two matrices, and re-store in the output buffer. +// +// [m00_new m01_new] = [m00 m01] * [m00_prev m01_prev] +// [m10_new m11_new] [m10 m11] [m10_prev m11_prev] +// +// The resulting matrix entries are: +// +// m00_new = m00 * m00_prev + m01 * m10_prev +// m01_new = m00 * m01_prev + m01 * m11_prev +// m10_new = m10 * m00_prev + m11 * m10_prev +// m11_new = m10 * m01_prev + m11 * m11_prev +// +// At this point the sign is right since both matrices were negated. 
+ + movq (mat), %rax + imulq m00, %rax + movq 16(mat), %rdx + imulq m01, %rdx + imulq 8(mat), m00 + imulq 24(mat), m01 + addq m00, m01 + leaq (%rax,%rdx), m00 + + movq (mat), %rax + imulq m10, %rax + movq 16(mat), %rdx + imulq m11, %rdx + imulq 8(mat), m10 + imulq 24(mat), m11 + addq m10, m11 + leaq (%rax,%rdx), m10 + + movq m00, (mat) + movq m01, 8(mat) + movq m10, 16(mat) + movq m11, 24(mat) + +// Third block of divsteps, same style but a total of 19 not 20 + + movq $-2, minus2 + movl $2, plus2e + movq fuv, t + movq minus2, m + testq d, d + cmovs zero, m + testq $1, grs +.set i, 0 +.rep 19 + cmovzq zero, m + cmovzq zero, t +.if (i != 0) + sarq $1, grs +.endif + xorq m, t + xorq m, d + btq $63, m + cmovcq grs, fuv + movq minus2, m + addq plus2, d + leaq (grs,t), grs +.if (i != 18) + cmovs zero, m + movq fuv, t + testq plus2, grs +.endif +.set i, (i+1) +.endr + sarq $1, grs + +// Extract the matrix entries from the final 19 divsteps + + movl $1048576, %eax + leaq (fuv,%rax), m00 + leaq (grs,%rax), m10 + shlq $21, m00 + shlq $21, m10 + sarq $43, m00 + sarq $43, m10 + + movq $2199024304128, %rax + leaq (fuv,%rax), m01 + leaq (grs,%rax), m11 + sarq $43, m01 + sarq $43, m11 + +// Multiply by this new matrix +// +// [m00_new m01_new] = [m00 m01] * [m00_prev m01_prev] +// [m10_new m11_new] [m10 m11] [m10_prev m11_prev] +// +// The resulting matrix entries are: +// +// m00_new = m00 * m00_prev + m01 * m10_prev +// m01_new = m00 * m01_prev + m01 * m11_prev +// m10_new = m10 * m00_prev + m11 * m10_prev +// m11_new = m10 * m01_prev + m11 * m11_prev +// +// Since we didn't negate the n matrix, all products are negated +// and so we insert negations + + movq (mat), %rax + imulq m00, %rax + movq 16(mat), %rdx + imulq m01, %rdx + imulq 8(mat), m00 + imulq 24(mat), m01 + addq m00, m01 + leaq (%rax,%rdx), m00 + negq m01 + negq m00 + + movq (mat), %rax + imulq m10, %rax + movq 16(mat), %rdx + imulq m11, %rdx + imulq 8(mat), m10 + imulq 24(mat), m11 + addq m10, m11 + leaq (%rax,%rdx), m10 + negq m11 + negq m10 + +// Now write back the final matrix and d for the whole 59 steps + + movq m00, (mat) + movq m01, 8(mat) + movq m10, 16(mat) + movq m11, 24(mat) + movq d, %rax + +// Restore registers and return + + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_max.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_max.S new file mode 100644 index 00000000000..020be639129 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_max.S @@ -0,0 +1,45 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Return maximum of two unsigned 64-bit words +// Inputs a, b; output function return +// +// extern uint64_t word_max (uint64_t a, uint64_t b); +// +// Standard x86-64 ABI: RDI = a, RSI = b, returns RAX +// Microsoft x64 ABI: RCX = a, RDX = b, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_max) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_max) + .text + +#define a %rdi +#define b %rsi + +S2N_BN_SYMBOL(word_max): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + + movq a, %rax + cmpq b, a + cmovcq b, %rax +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_min.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_min.S new file mode 100644 index 00000000000..9944383c822 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_min.S @@ -0,0 +1,45 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Return minimum of two unsigned 64-bit words +// Inputs a, b; output function return +// +// extern uint64_t word_min (uint64_t a, uint64_t b); +// +// Standard x86-64 ABI: RDI = a, RSI = b, returns RAX +// Microsoft x64 ABI: RCX = a, RDX = b, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_min) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_min) + .text + +#define a %rdi +#define b %rsi + +S2N_BN_SYMBOL(word_min): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + + movq a, %rax + cmpq b, a + cmovncq b, %rax +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_negmodinv.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_negmodinv.S new file mode 100644 index 00000000000..f8b9598597d --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_negmodinv.S @@ -0,0 +1,76 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Single-word negated modular inverse (-1/a) mod 2^64 +// Input a; output function return +// +// extern uint64_t word_negmodinv (uint64_t a); +// +// A 64-bit function that returns a negated multiplicative inverse mod 2^64 +// of its input, assuming that input is odd. Given odd input a, the result z +// will satisfy a * z + 1 == 0 (mod 2^64), i.e. a 64-bit word multiplication +// a * z will give -1. 
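Reviewer aid, not part of the patch: the Newton/Hensel iteration implemented below can be written in C as follows (name illustrative); unsigned overflow is well defined, each multiplication by 1 + e squares the error term, and since the initial approximation is good to 5 bits, e^16 is divisible by 2^80 and so vanishes mod 2^64.

#include <stdint.h>

static uint64_t word_negmodinv_ref(uint64_t a)   // a assumed odd
{
  uint64_t x = (a - (a << 2)) ^ 2;   // a*x == -1 (mod 2^5)
  uint64_t e = a * x + 1;            // error term, divisible by 2^5
  x *= 1 + e;  e *= e;               // error of x now e^2
  x *= 1 + e;  e *= e;               // error of x now e^4
  x *= 1 + e;  e *= e;               // error of x now e^8
  x *= 1 + e;                        // error e^16 == 0 (mod 2^64)
  return x;
}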
+// +// Standard x86-64 ABI: RDI = a, returns RAX +// Microsoft x64 ABI: RCX = a, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_negmodinv) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_negmodinv) + .text + +S2N_BN_SYMBOL(word_negmodinv): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi +#endif + +// Initial magical 5-bit approximation x = (a - a<<2) xor 2 + + movq %rdi, %rcx + movq %rdi, %rax + shlq $2, %rcx + subq %rcx, %rax + xorq $2, %rax + +// Now refine to 64-bit congruence + + movq %rax, %rcx // %rcx = x + imulq %rdi, %rcx // %rcx = a * x + movl $2, %edx + addq %rcx, %rdx // %rdx = 1 + e = 2 + a * x + addq $1, %rcx // %rcx = e = a * x + 1 + + imulq %rdx, %rax // %rax = x * (1 + e) + + imulq %rcx, %rcx // %rcx = e^2 + movl $1, %edx + addq %rcx, %rdx + imulq %rdx, %rax // %rax = x * (1 + e) * (1 + e^2) + + imulq %rcx, %rcx // %rcx = e^4 + movl $1, %edx + addq %rcx, %rdx + imulq %rdx, %rax // %rax = x * (1 + e) * (1 + e^2) * (1 + e^4) + + imulq %rcx, %rcx // %rcx = e^8 + movl $1, %edx + addq %rcx, %rdx + imulq %rdx, %rax // %rax = x * (1 + e) * ... * * (1 + e^8) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_popcount.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_popcount.S new file mode 100644 index 00000000000..9647b2cc862 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_popcount.S @@ -0,0 +1,70 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count number of set bits in a single 64-bit word (population count) +// Input a; output function return +// +// extern uint64_t word_popcount (uint64_t a); +// +// Standard x86-64 ABI: RDI = a, returns RAX +// Microsoft x64 ABI: RCX = a, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_popcount) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_popcount) + .text + +S2N_BN_SYMBOL(word_popcount): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi +#endif + +// The code is generated by gcc -O3 (version 11.4.0) from +// +// uint64_t word_popcount(uint64_t x) +// { uint64_t x2 = x - ((x & UINT64_C(0xAAAAAAAAAAAAAAAA))>>1); +// uint64_t x4 = (x2 & UINT64_C(0x3333333333333333)) + +// ((x2 & UINT64_C(0xCCCCCCCCCCCCCCCC))>>2); +// uint64_t x8 = (x4 + (x4>>4)) & UINT64_C(0x0F0F0F0F0F0F0F0F); +// uint64_t x64 = x8 * UINT64_C(0x101010101010101); +// uint64_t y = x64>>56; +// return y; +// } + + movabsq $0x5555555555555555, %rdx + movq %rdi, %rax + shrq $1, %rax + andq %rdx, %rax + subq %rax, %rdi + movabsq $0x3333333333333333, %rax + movq %rdi, %rdx + andq %rax, %rdi + shrq $0x2, %rdx + andq %rax, %rdx + addq %rdi, %rdx + movq %rdx, %rax + shrq $0x4, %rax + addq %rdx, %rax + movabsq $0xf0f0f0f0f0f0f0f, %rdx + andq %rdx, %rax + movabsq $0x101010101010101, %rdx + imulq %rdx, %rax + shrq $0x38, %rax + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git 
a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_recip.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_recip.S new file mode 100644 index 00000000000..dc2c0f91813 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_recip.S @@ -0,0 +1,144 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Single-word reciprocal, underestimate of 2^128 / a with implicit 1 added +// Input a; output function return +// +// extern uint64_t word_recip (uint64_t a); +// +// Given an input word "a" with its top bit set (i.e. 2^63 <= a < 2^64), the +// result "x" is implicitly augmented with a leading 1 giving x' = 2^64 + x. +// The result is x' = ceil(2^128 / a) - 1, which except for the single +// special case a = 2^63 is the same thing as x' = floor(2^128 / a). +// +// Standard x86-64 ABI: RDI = a, returns RAX +// Microsoft x64 ABI: RCX = a, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_recip) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_recip) + .text + +#define a %rdi +#define x %rcx +#define b %rsi + +# Some aliasing here + +#define t %rax +#define l %rax + +#define d %rdx +#define h %rdx + +S2N_BN_SYMBOL(word_recip): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi +#endif + +// Scale the input down: b overestimates a/2^16 with b <= 2^48 and +// x underestimates 2^64/b with b * x =~= 2^64, accurate to ~2 bits. + + movq a, b + movq $0x1FFFFFFFFFFFF, x + shrq $16, b + xorq b, x + incq b + shrq $32, x + +// Suppose x = 2^64/b * (1 - e). and get scaled error d = 2^64 * e + + movq b, d + imulq x, d + negq d + +// Rescale to give c = 2^15 * e (so c <= 2^13) and compute +// e + e^2 + e^3 + e^4 = (1 + e^2) (e + e^2) +// = (2^30 + c^2) * (2^15 * c + c^2) / 2^60 +// and then x * (1 + e + e^2 + e^3 + e^4) +// = (2^30 * x + x * (2^30 + c^2) * (2^30 * c + c^2) / 2^30) / 2^30 + + movq d, t + shrq $49, t + imulq t, t + shrq $34, d + addq t, d + orq $0x40000000, t + imulq d, t + shrq $30, t + imulq x, t + shlq $30, x + addq t, x + shrq $30, x + +// Now b * x =~= 2^64, accurate to ~10 bits. +// Do a 64-bit Newton step, scaling up x by 16 bits in the process. + + movq b, d + imulq x, d + negq d + shrq $24, d + imulq x, d + shlq $16, x + shrq $24, d + addq d, x + +// Now b * x =~= 2^80, accurate to ~20 bits. +// Do a 64-bit Newton step, scaling up x by 31 bits in the process + + movq b, d + imulq x, d + negq d + shrq $32, d + imulq x, d + shlq $31, x + shrq $17, d + addq d, x + +// Now a * x =~= 2^127, accurate to ~40 bits. Do a Newton step at full size. +// Instead of literally negating the product (h,l) we complement bits in +// the extracted bitfield, which is close enough and a bit faster. +// At the end we also shift x one more bit left, losing the known-1 top bit +// so that a * (2^64 + x) =~= 2^128. + + movq a, l + mulq x + shrdq $60, h, l + movq x, h + shrq $33, h + notq l + imulq h, l + shlq $1, x + shrq $33, l + addq l, x + +// Test if (x' + 1) * a < 2^128 where x' = 2^64 + x, catching the special +// case where x + 1 would wrap, corresponding to input a = 2^63. 
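Reviewer aid, not part of the patch: the specification in the header of word_recip above can be cross-checked against a one-line 128-bit reference, since ceil(2^128/a) - 1 = floor((2^128 - 1)/a) for any nonzero a. The unsigned __int128 type is a GCC/Clang extension and the helper name is illustrative.

#include <stdint.h>

static uint64_t word_recip_ref(uint64_t a)                // requires 2^63 <= a
{
  unsigned __int128 all_ones = ~(unsigned __int128)0;     // 2^128 - 1
  unsigned __int128 xprime = all_ones / a;                // 2^64 + result
  return (uint64_t)xprime;                                // drop the implicit top 1
}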
+ + addq $1, x + movq a, l + sbbq $0, x + mulq x + movq x, %rax + addq a, h + +// Select either x or x + 1 accordingly as the final answer + + sbbq $0, %rax +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_add_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_add_p256.S new file mode 100644 index 00000000000..76ab05b3430 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_add_p256.S @@ -0,0 +1,100 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Add modulo p_256, z := (x + y) mod p_256, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_add_p256 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add_p256) + .text + +#define z %rdi +#define x %rsi +#define y %rdx + +#define d0 %rax +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n1 %r10 +#define n3 %rdx +#define c %r11 + +#define n1short %r10d + + + +S2N_BN_SYMBOL(bignum_add_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Load and add the two inputs as 2^256 * c + [d3;d2;d1;d0] = x + y + + xorq c, c + movq (x), d0 + addq (y), d0 + movq 8(x), d1 + adcq 8(y), d1 + movq 16(x), d2 + adcq 16(y), d2 + movq 24(x), d3 + adcq 24(y), d3 + adcq c, c + +// Now subtract 2^256 * c + [d3;d3;d1;d1] = x + y - p_256 +// The constants n1 and n3 in [n3; 0; n1; -1] = p_256 are saved for later + + subq $-1, d0 + movl $0x00000000ffffffff, n1short + sbbq n1, d1 + sbbq $0, d2 + movq $0xffffffff00000001, n3 + sbbq n3, d3 + +// Since by hypothesis x < p_256 we know x + y - p_256 < 2^256, so the top +// carry c actually gives us a bitmask for x + y - p_256 < 0, which we +// now use to make a masked p_256' = [n3; 0; n1; c] + + sbbq $0, c + andq c, n1 + andq c, n3 + +// Do the corrective addition and copy to output + + addq c, d0 + movq d0, (z) + adcq n1, d1 + movq d1, 8(z) + adcq $0, d2 + movq d2, 16(z) + adcq n3, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_bigendian_4.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_bigendian_4.S new file mode 100644 index 00000000000..01684e73773 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_bigendian_4.S @@ -0,0 +1,88 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
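Reviewer aid, not part of the patch: the add-then-masked-correct idea of bignum_add_p256 above can be restated with 128-bit limb arithmetic in C. The helper name, the u128 typedef (GCC/Clang extension) and the explicit carry/borrow bookkeeping are illustrative, and the inputs are assumed already reduced mod p_256 as in the header comment.

#include <stdint.h>
typedef unsigned __int128 u128;   // GCC/Clang extension

static void bignum_add_p256_ref(uint64_t z[4], const uint64_t x[4], const uint64_t y[4])
{
  static const uint64_t p256[4] = { 0xffffffffffffffffULL, 0x00000000ffffffffULL,
                                    0x0000000000000000ULL, 0xffffffff00000001ULL };
  uint64_t d[4], c = 0, b = 0;
  for (int i = 0; i < 4; i++) {                 // d = x + y with top carry c
    u128 t = (u128)x[i] + y[i] + c;
    d[i] = (uint64_t)t;  c = (uint64_t)(t >> 64);
  }
  for (int i = 0; i < 4; i++) {                 // d = d - p_256 with borrow b
    u128 t = (u128)d[i] - p256[i] - b;
    d[i] = (uint64_t)t;  b = (uint64_t)(t >> 64) & 1;
  }
  uint64_t mask = (c < b) ? ~(uint64_t)0 : 0;   // x + y - p_256 went negative
  c = 0;
  for (int i = 0; i < 4; i++) {                 // masked corrective addition of p_256
    u128 t = (u128)d[i] + (p256[i] & mask) + c;
    z[i] = (uint64_t)t;  c = (uint64_t)(t >> 64);
  }
}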
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert 4-digit (256-bit) bignum to/from big-endian form +// Input x[4]; output z[4] +// +// extern void bignum_bigendian_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// The same function is given two other prototypes whose names reflect the +// treatment of one or other argument as a byte array rather than word array: +// +// extern void bignum_frombebytes_4 +// (uint64_t z[static 4], uint8_t x[static 32]); +// +// extern void bignum_tobebytes_4 +// (uint8_t z[static 32], uint64_t x[static 4]); +// +// Since x86 is little-endian, and bignums are stored with little-endian +// word order, this is simply byte reversal and is implemented as such. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_bigendian_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_bigendian_4) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_frombebytes_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_frombebytes_4) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tobebytes_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tobebytes_4) + + .text + +#define z %rdi +#define x %rsi +#define a %rax +#define b %rdx + +// All loads and stores are word-sized, then we use BSWAP to +// reverse the byte order, as well as switching round the word order +// when writing back. The reads and writes are organized in mirror-image +// pairs (0-3 and 1-2) to allow x and z to point to the same buffer +// without using more intermediate registers. + +S2N_BN_SYMBOL(bignum_bigendian_4): +S2N_BN_SYMBOL(bignum_frombebytes_4): +S2N_BN_SYMBOL(bignum_tobebytes_4): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// 0 and 3 words + + movq (x), a + movq 24(x), b + bswapq a + bswapq b + movq a, 24(z) + movq b, (z) + +// 1 and 2 words + + movq 8(x), a + movq 16(x), b + bswapq a + bswapq b + movq a, 16(z) + movq b, 8(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_cmul_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_cmul_p256.S new file mode 100644 index 00000000000..19883af3c24 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_cmul_p256.S @@ -0,0 +1,129 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word modulo p_256, z := (c * x) mod p_256, assuming +// x reduced +// Inputs c, x[4]; output z[4] +// +// extern void bignum_cmul_p256 +// (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = c, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = c, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p256) + .text + +#define z %rdi + +// Temporarily moved here for initial multiply +#define x %rcx +// Likewise this is thrown away after initial multiply +#define m %rdx + +#define a %rax +#define c %rcx + +#define d0 %rsi +#define d1 %r8 +#define d2 %r9 +#define d3 %r10 +#define h %r11 + +#define ashort %eax + +// Multiplier again for second stage +#define q %rdx + + +S2N_BN_SYMBOL(bignum_cmul_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Shuffle inputs (since we want multiplier in %rdx) + + movq %rdx, x + movq %rsi, m + +// Multiply, accumulating the result as 2^256 * h + [d3;d2;d1;d0] + + mulxq (x), d0, d1 + mulxq 8(x), a, d2 + addq a, d1 + mulxq 16(x), a, d3 + adcq a, d2 + mulxq 24(x), a, h + adcq a, d3 + adcq $0, h + +// Writing the product as z = 2^256 * h + 2^192 * d3 + t = 2^192 * hl + t, our +// intended quotient approximation is (hl + hl>>32 + 1)>>64. Note that by +// hypothesis our product is <= (2^64 - 1) * (p_256 - 1), so there is no need +// to max this out to avoid wrapping. + + movq h, a + shldq $32, d3, a + movq h, q + shrq $32, q + + xorq c, c + subq $1, c + + adcq d3, a + adcq h, q + +// Now compute the initial pre-reduced result z - p_256 * q +// = z - (2^256 - 2^224 + 2^192 + 2^96 - 1) * q +// = z - 2^192 * 0xffffffff00000001 * q - 2^64 * 0x0000000100000000 * q + q + + addq q, d0 + movq $0x0000000100000000, a + mulxq a, a, c + sbbq $0, a + sbbq $0, c + subq a, d1 + sbbq c, d2 + movq $0xffffffff00000001, a + mulxq a, a, c + sbbq a, d3 + sbbq c, h + +// Now our top word h is either zero or all 1s, and we use this to discriminate +// whether a correction is needed because our result is negative, as a bitmask +// Do a masked addition of p_256 and write back + + movl $0x00000000ffffffff, ashort + andq h, a + xorq c, c + subq a, c + addq h, d0 + movq d0, (z) + adcq a, d1 + movq d1, 8(z) + adcq $0, d2 + movq d2, 16(z) + adcq c, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_cmul_p256_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_cmul_p256_alt.S new file mode 100644 index 00000000000..d68c947402a --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_cmul_p256_alt.S @@ -0,0 +1,146 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word modulo p_256, z := (c * x) mod p_256, assuming +// x reduced +// Inputs c, x[4]; output z[4] +// +// extern void bignum_cmul_p256_alt +// (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = c, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = c, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p256_alt) + .text + +#define z %rdi + +// Temporarily moved here for initial multiply then thrown away + +#define x %rcx +#define m %rsi + +// Other variables + +#define d %rdx +#define a %rax +#define c %rcx + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 +#define h %rsi + +#define ashort %eax +#define hshort %esi + +// Multiplier again for second stage + +#define q %rcx + +S2N_BN_SYMBOL(bignum_cmul_p256_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Shuffle inputs (since we want %rdx for the high parts of products) + + movq %rdx, x + +// Multiply, accumulating the result as 2^256 * h + [d3;d2;d1;d0] + + movq (x), a + mulq m + movq a, d0 + movq d, d1 + + movq 8(x), a + mulq m + xorq d2, d2 + addq a, d1 + adcq d, d2 + + movq 16(x), a + mulq m + xorq d3, d3 + addq a, d2 + adcq d, d3 + + movq 24(x), a + mulq m + xorl hshort, hshort + addq a, d3 + adcq d, h + +// Writing the product as z = 2^256 * h + 2^192 * d3 + t = 2^192 * hl + t, our +// intended quotient approximation is (hl + hl>>32 + 1)>>64. Note that by +// hypothesis our product is <= (2^64 - 1) * (p_256 - 1), so there is no need +// to max this out to avoid wrapping. + + movq h, a + shldq $32, d3, a + movq h, q + shrq $32, q + xorq d, d + subq $1, d + adcq d3, a + adcq h, q + +// Now compute the initial pre-reduced result z - p_256 * q +// = z - (2^256 - 2^224 + 2^192 + 2^96 - 1) * q +// = z - 2^192 * 0xffffffff00000001 * q - 2^64 * 0x0000000100000000 * q + q + + movq $0x0000000100000000, a + mulq q + addq q, d0 + sbbq $0, a + sbbq $0, d + subq a, d1 + sbbq d, d2 + sbbq $0, d3 + sbbq $0, h + movq $0xffffffff00000001, a + mulq q + subq a, d3 + sbbq d, h + +// Now our top word h is either zero or all 1s, and we use this to discriminate +// whether a correction is needed because our result is negative, as a bitmask +// Do a masked addition of p_256 and write back + + movl $0x00000000ffffffff, ashort + andq h, a + xorq c, c + subq a, c + addq h, d0 + movq d0, (z) + adcq a, d1 + movq d1, 8(z) + adcq $0, d2 + movq d2, 16(z) + adcq c, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_deamont_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_deamont_p256.S new file mode 100644 index 00000000000..6c0e66ec23f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_deamont_p256.S @@ -0,0 +1,145 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from almost-Montgomery form, z := (x / 2^256) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_deamont_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Convert a 4-digit bignum x out of its (optionally almost) Montgomery form, +// "almost" meaning any 4-digit input will work, with no range restriction. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_deamont_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_deamont_p256) + .text + +#define z %rdi +#define x %rsi + +// Re-use these as temporaries in the correction phase + +#define d %rdx +#define u %r10 +#define v %r11 + +#define dshort %edx +#define ushort %r10d + +// Add %rdx * m into a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rcx as temporaries + +#define mulpadd(high,low,m) \ + mulxq m, %rax, %rcx ; \ + adcxq %rax, low ; \ + adoxq %rcx, high + +S2N_BN_SYMBOL(bignum_deamont_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save one more register to play with + + pushq %rbx + +// Set up an initial 4-word window [%r11,%r10,%r9,%r8] = x + + movq (x), %r8 + movq 8(x), %r9 + movq 16(x), %r10 + movq 24(x), %r11 + +// Fill in two zeros to the left + + xorq %rbx, %rbx + xorq %rsi, %rsi + +// Montgomery reduce windows 0 and 1 together + + movq $0x0000000100000000, %rdx + mulpadd(%r10,%r9,%r8) + mulpadd(%r11,%r10,%r9) + movq $0xffffffff00000001, %rdx + mulpadd(%rbx,%r11,%r8) + mulpadd(%rsi,%rbx,%r9) + movl $0, %r8d + adcxq %r8, %rsi + +// Append just one more leading zero (by the above %r8 = 0 already). + + xorq %r9, %r9 + +// Montgomery reduce windows 2 and 3 together + + movq $0x0000000100000000, %rdx + mulpadd(%rbx,%r11,%r10) + mulpadd(%rsi,%rbx,%r11) + movq $0xffffffff00000001, %rdx + mulpadd(%r8,%rsi,%r10) + mulpadd(%r9,%r8,%r11) + movl $0, %r10d + adcxq %r10, %r9 + +// We now have a pre-reduced dd = [%r9;%r8;%rsi;%rbx]. Load non-trivial digits +// of p_256 = [v; 0; u; -1] + + movl $0x00000000ffffffff, ushort + movq $0xffffffff00000001, v + +// Now do the subtraction (p_256-1) - (%r9;%r8;%rsi;%rbx) to get the carry + + movq $-2, d + subq %rbx, d + movq u, d + sbbq %rsi, d + movl $0, dshort + sbbq %r8, d + movq v, d + sbbq %r9, d + +// Convert the carry CF <=> dd >= p_256 to a bitmask and do a masked subtraction + + sbbq d, d + andq d, u + andq d, v + + subq d, %rbx + sbbq u, %rsi + sbbq $0, %r8 + sbbq v, %r9 + +// Write back + + movq %rbx, (z) + movq %rsi, 8(z) + movq %r8, 16(z) + movq %r9, 24(z) + +// Restore saved register and return + + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_deamont_p256_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_deamont_p256_alt.S new file mode 100644 index 00000000000..a02ce2f2fab --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_deamont_p256_alt.S @@ -0,0 +1,158 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from almost-Montgomery form, z := (x / 2^256) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_deamont_p256_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Convert a 4-digit bignum x out of its (optionally almost) Montgomery form, +// "almost" meaning any 4-digit input will work, with no range restriction. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_deamont_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_deamont_p256_alt) + .text + +#define z %rdi +#define x %rsi + +// Re-use these as temporaries in the correction phase + +#define d %rdx +#define u %rax +#define v %rcx + +#define dshort %edx +#define ushort %eax + +// Add %rdx * m into a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rcx as temporaries + +#define mulpado(high,low,m) \ + mulxq m, %rax, %rcx ; \ + adcxq %rax, low ; \ + adoxq %rcx, high + +// Add %rcx * m into a register-pair (high,low) maintaining consistent +// carry-catching with carry (negated, as bitmask) and using %rax and %rdx +// as temporaries + +#define mulpadd(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rcx; \ + subq carry, %rdx ; \ + addq %rax, low ; \ + adcq %rdx, high ; \ + sbbq carry, carry + +// Initial version assuming no carry-in + +#define mulpadi(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rcx; \ + addq %rax, low ; \ + adcq %rdx, high ; \ + sbbq carry, carry + +// Version with no carry in or out + +#define mulpadn(high,low,m) \ + movq m, %rax ; \ + mulq %rcx; \ + addq %rax, low ; \ + adcq %rdx, high + +S2N_BN_SYMBOL(bignum_deamont_p256_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Set up an initial 4-word window [%r11,%r10,%r9,%r8] = x + + movq (x), %r8 + movq 8(x), %r9 + movq 16(x), %r10 + movq 24(x), %r11 + +// Load constant 2^32; %rcx toggles between this and (1 - %rcx) below + + movq $0x0000000100000000, %rcx + +// Montgomery reduce windows 0 and 1 together as [%r8;%rsi;%r11;%r10] + + mulpadi(%rsi,%r10,%r9,%r8) + mulpadd(%rsi,%r11,%r10,%r9) + negq %rcx + negq %rsi + incq %rcx + mulpadi(%r8,%rsi,%r11,%r8) + negq %r8 + mulpadn(%r8,%rsi,%r9) + +// Montgomery reduce windows 2 and 3 together as [%r10;%r9;%r8;%rsi] + + negq %rcx + incq %rcx + mulpadi(%r9,%rsi,%r11,%r10) + mulpadd(%r9,%r8,%rsi,%r11) + negq %rcx + negq %r9 + incq %rcx + mulpadi(%r10,%r9,%r8,%r10) + negq %r10 + mulpadn(%r10,%r9,%r11) + +// We now have a pre-reduced result z = [%r10;%r9;%r8;%rsi]. +// From the above we have %rcx = 0xffffffff00000001, which we use to generate +// [0x00000000fffffffe; -1; 0xffffffff00000000; 1] = 2^256 - p_256 and +// then compute [%rcx;%rdx;%r11;%rax] = z + (2^256 - p_256) + + xorl %edx, %edx + leaq 1(%rdx), %rax + addq %rsi, %rax + leaq -1(%rcx), %r11 + adcq %r8, %r11 + notq %rdx + adcq %r9, %rdx + notq %rcx + adcq %r10, %rcx + +// CF is set iff z + (2^256 - p_256) >= 2^256, i.e. if z >= p_256. 
+// If so we want the result of the subtraction (in 4 words) + + cmovcq %rax, %rsi + cmovcq %r11, %r8 + cmovcq %rdx, %r9 + cmovcq %rcx, %r10 + +// Write back + + movq %rsi, (z) + movq %r8, 8(z) + movq %r9, 16(z) + movq %r10, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_demont_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_demont_p256.S new file mode 100644 index 00000000000..d9bc8c66e06 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_demont_p256.S @@ -0,0 +1,112 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from Montgomery form z := (x / 2^256) mod p_256, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_demont_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// This assumes the input is < p_256 for correctness. If this is not the case, +// use the variant "bignum_deamont_p256" instead. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_demont_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_demont_p256) + .text + +#define z %rdi +#define x %rsi + +// Add %rdx * m into a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rcx as temporaries + +#define mulpadd(high,low,m) \ + mulxq m, %rax, %rcx ; \ + adcxq %rax, low ; \ + adoxq %rcx, high + +S2N_BN_SYMBOL(bignum_demont_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save one more register to play with + + pushq %rbx + +// Set up an initial 4-word window [%r11,%r10,%r9,%r8] = x + + movq (x), %r8 + movq 8(x), %r9 + movq 16(x), %r10 + movq 24(x), %r11 + +// Fill in two zeros to the left + + xorq %rbx, %rbx + xorq %rsi, %rsi + +// Montgomery reduce windows 0 and 1 together + + movq $0x0000000100000000, %rdx + mulpadd(%r10,%r9,%r8) + mulpadd(%r11,%r10,%r9) + movq $0xffffffff00000001, %rdx + mulpadd(%rbx,%r11,%r8) + mulpadd(%rsi,%rbx,%r9) + movl $0, %r8d + adcxq %r8, %rsi + +// Append just one more leading zero (by the above %r8 = 0 already). + + xorq %r9, %r9 + +// Montgomery reduce windows 2 and 3 together + + movq $0x0000000100000000, %rdx + mulpadd(%rbx,%r11,%r10) + mulpadd(%rsi,%rbx,%r11) + movq $0xffffffff00000001, %rdx + mulpadd(%r8,%rsi,%r10) + mulpadd(%r9,%r8,%r11) + movl $0, %r10d + adcxq %r10, %r9 + +// Since the input was assumed reduced modulo, i.e. < p, we actually know that +// 2^256 * [carries; %r9;%r8;%rsi;%rbx] is <= (p - 1) + (2^256 - 1) p +// and hence [carries; %r9;%r8;%rsi;%rbx] < p. This means in fact carries = 0 +// and [%r9;%r8;%rsi;%rbx] is already our answer, without further correction. +// Write that back. 
+ + movq %rbx, (z) + movq %rsi, 8(z) + movq %r8, 16(z) + movq %r9, 24(z) + +// Restore saved register and return + + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_demont_p256_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_demont_p256_alt.S new file mode 100644 index 00000000000..f53228cb9fd --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_demont_p256_alt.S @@ -0,0 +1,130 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from Montgomery form z := (x / 2^256) mod p_256, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_demont_p256_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// This assumes the input is < p_256 for correctness. If this is not the case, +// use the variant "bignum_deamont_p256" instead. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_demont_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_demont_p256_alt) + .text + +#define z %rdi +#define x %rsi + +// Add %rdx * m into a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rcx as temporaries + +#define mulpado(high,low,m) \ + mulxq m, %rax, %rcx ; \ + adcxq %rax, low ; \ + adoxq %rcx, high + +// Add %rcx * m into a register-pair (high,low) maintaining consistent +// carry-catching with carry (negated, as bitmask) and using %rax and %rdx +// as temporaries + +#define mulpadd(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rcx; \ + subq carry, %rdx ; \ + addq %rax, low ; \ + adcq %rdx, high ; \ + sbbq carry, carry + +// Initial version assuming no carry-in + +#define mulpadi(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rcx; \ + addq %rax, low ; \ + adcq %rdx, high ; \ + sbbq carry, carry + +// Version with no carry in or out + +#define mulpadn(high,low,m) \ + movq m, %rax ; \ + mulq %rcx; \ + addq %rax, low ; \ + adcq %rdx, high + +S2N_BN_SYMBOL(bignum_demont_p256_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Set up an initial 4-word window [%r11,%r10,%r9,%r8] = x + + movq (x), %r8 + movq 8(x), %r9 + movq 16(x), %r10 + movq 24(x), %r11 + +// Load constant 2^32; %rcx toggles between this and (1 - %rcx) below + + movq $0x0000000100000000, %rcx + +// Montgomery reduce windows 0 and 1 together as [%r8;%rsi;%r11;%r10] + + mulpadi(%rsi,%r10,%r9,%r8) + mulpadd(%rsi,%r11,%r10,%r9) + negq %rcx + negq %rsi + incq %rcx + mulpadi(%r8,%rsi,%r11,%r8) + negq %r8 + mulpadn(%r8,%rsi,%r9) + +// Montgomery reduce windows 2 and 3 together as [%r10;%r9;%r8;%rsi] + + negq %rcx + incq %rcx + mulpadi(%r9,%rsi,%r11,%r10) + mulpadd(%r9,%r8,%rsi,%r11) + negq %rcx + negq %r9 + incq %rcx + mulpadi(%r10,%r9,%r8,%r10) + negq %r10 + mulpadn(%r10,%r9,%r11) + +// Since the input was assumed reduced modulo, i.e. < p, we actually know that +// 2^256 * [carries; %r10;%r9;%r8;%rsi] is <= (p - 1) + (2^256 - 1) p +// and hence [carries; %r10;%r9;%r8;%rsi] < p. 
This means in fact carries = 0 +// and [%r10;%r9;%r8;%rsi] is already our answer, without further correction. +// Write that back. + + movq %rsi, (z) + movq %r8, 8(z) + movq %r9, 16(z) + movq %r10, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_double_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_double_p256.S new file mode 100644 index 00000000000..8c1b2cf2959 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_double_p256.S @@ -0,0 +1,99 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Double modulo p_256, z := (2 * x) mod p_256, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_double_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_double_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_double_p256) + .text + +#define z %rdi +#define x %rsi + +#define d0 %rdx +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n1 %r10 +#define n3 %r11 +#define c %rax + +#define n1short %r10d + + + +S2N_BN_SYMBOL(bignum_double_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load the input and double it so that 2^256 * c + [d3;d2;d1;d0] = 2 * x +// Could also consider using shld to decouple carries + + xorq c, c + movq (x), d0 + addq d0, d0 + movq 8(x), d1 + adcq d1, d1 + movq 16(x), d2 + adcq d2, d2 + movq 24(x), d3 + adcq d3, d3 + adcq c, c + +// Now subtract 2^256 * c + [d3;d3;d1;d1] = 2 * x - p_256 +// The constants n1 and n3 in [n3; 0; n1; -1] = p_256 are saved for later + + subq $-1, d0 + movl $0x00000000ffffffff, n1short + sbbq n1, d1 + sbbq $0, d2 + movq $0xffffffff00000001, n3 + sbbq n3, d3 + +// Since by hypothesis x < p_256 we know 2 * x - p_256 < 2^256, so the top +// carry c actually gives us a bitmask for 2 * x - p_256 < 0, which we +// now use to make a masked p_256' = [n3; 0; n1; c] + + sbbq $0, c + andq c, n1 + andq c, n3 + +// Do the corrective addition and copy to output + + addq c, d0 + movq d0, (z) + adcq n1, d1 + movq d1, 8(z) + adcq $0, d2 + movq d2, 16(z) + adcq n3, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_half_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_half_p256.S new file mode 100644 index 00000000000..2c2da0f9e27 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_half_p256.S @@ -0,0 +1,91 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Halve modulo p_256, z := (x / 2) mod p_256, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_half_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_half_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_half_p256) + .text + +#define z %rdi +#define x %rsi + +#define a %rax +#define d0 %rcx +#define d1 %rdx +#define d2 %r8 +#define d3 %r9 + +#define d0short %ecx +#define d1short %edx + + + +S2N_BN_SYMBOL(bignum_half_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load lowest digit and get a mask for its lowest bit in d0 + + movq (x), a + movl $1, d0short + andq a, d0 + negq d0 + +// Create a masked version of p_256 + + movl $0x00000000ffffffff, d1short + xorq d3, d3 + andq d0, d1 + subq d1, d3 + xorq d2, d2 + +// Perform addition with masked p_256. Catch the carry in a, as a bitmask +// for convenience though we only use its LSB below with SHRD + + addq a, d0 + adcq 8(x), d1 + adcq 16(x), d2 + adcq 24(x), d3 + sbbq a, a + +// Shift right, pushing the carry back down, and store back + + shrdq $1, d1, d0 + movq d0, (z) + shrdq $1, d2, d1 + movq d1, 8(z) + shrdq $1, d3, d2 + movq d2, 16(z) + shrdq $1, a, d3 + movq d3, 24(z) + +// Return + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_inv_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_inv_p256.S new file mode 100644 index 00000000000..c75ef212679 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_inv_p256.S @@ -0,0 +1,1623 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Modular inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1 +// Input x[4]; output z[4] +// +// extern void bignum_inv_p256(uint64_t z[static 4],uint64_t x[static 4]); +// +// If the 4-digit input x is coprime to p_256, i.e. is not divisible +// by it, returns z < p_256 such that x * z == 1 (mod p_256). Note that +// x does not need to be reduced modulo p_256, but the output always is. +// If the input is divisible (i.e. is 0 or p_256), then there can be no +// modular inverse and z = 0 is returned. 
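+//
+// Method sketch, as read off the code below: a divstep-style binary-gcd
+// iteration.  Ten batches of 59 divsteps (the divstep59 macro) are run on
+// low words of f and g; each batch yields a 2x2 transition matrix that is
+// applied to the 5-word f and g and to the auxiliary values u and v, with
+// an almost-Montgomery reduction (amontred) keeping u and v at 4 words.
+// The first nine batches update all of f, g, u and v; the tenth needs only
+// u and the sign of f, and is handled separately at the end.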
+// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_inv_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_inv_p256) + .text + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define f 0(%rsp) +#define g (5*N)(%rsp) +#define u (10*N)(%rsp) +#define v (15*N)(%rsp) +#define tmp (20*N)(%rsp) +#define tmp2 (21*N)(%rsp) +#define i (22*N)(%rsp) +#define d (23*N)(%rsp) + +#define mat (24*N)(%rsp) + +// Backup for the input pointer + +#define res (28*N)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (30*N) + +// Syntactic variants to make x86_att version simpler to generate + +#define F 0 +#define G (5*N) +#define U (10*N) +#define V (15*N) +#define MAT (24*N) + +#define ff (%rsp) +#define gg (5*N)(%rsp) + +// --------------------------------------------------------------------------- +// Core signed almost-Montgomery reduction macro from u[4..0] to u[3..0]. +// --------------------------------------------------------------------------- + +#define amontred(P) \ +/* We only know the input is -2^316 < x < 2^316. To do traditional */ \ +/* unsigned Montgomery reduction, start by adding 2^61 * p_256. */ \ + movq $0xe000000000000000, %r8 ; \ + addq P, %r8 ; \ + movq $0xffffffffffffffff, %r9 ; \ + adcq 8+P, %r9 ; \ + movq $0x000000001fffffff, %r10 ; \ + adcq 16+P, %r10 ; \ + movq $0x2000000000000000, %r11 ; \ + adcq 24+P, %r11 ; \ + movq $0x1fffffffe0000000, %r12 ; \ + adcq 32+P, %r12 ; \ +/* Let [%r8;%rbx] = 2^32 * w and [%rdx;%rax] = (2^64 - 2^32 + 1) * w */ \ +/* where w is the lowest word */ \ + movq %r8, %rbx ; \ + shlq $32, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %r8; \ + shrq $32, %r8 ; \ +/* Hence basic addition of (2^256 - 2^224 + 2^192 + 2^96) * w */ \ + addq %rbx, %r9 ; \ + adcq %r8, %r10 ; \ + adcq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ +/* Now capture carry and subtract p_256 if set (almost-Montgomery) */ \ + sbbq %rax, %rax ; \ + movl $0x00000000ffffffff, %ebx ; \ + andq %rax, %rbx ; \ + movq $0xffffffff00000001, %rdx ; \ + andq %rax, %rdx ; \ + subq %rax, %r9 ; \ + movq %r9, P ; \ + sbbq %rbx, %r10 ; \ + movq %r10, 8+P ; \ + sbbq $0, %r11 ; \ + movq %r11, 16+P ; \ + sbbq %rdx, %r12 ; \ + movq %r12, 24+P + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. 
+// But different in register usage and returning the final matrix as +// +// [ %r8 %r10] +// [ %r12 %r14] +// +// and also returning the matrix still negated (which doesn't matter) + +#define divstep59(din,fin,gin) \ + movq din, %rsi ; \ + movq fin, %rdx ; \ + movq gin, %rcx ; \ + movq %rdx, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq $0xfffffffffffffffe, %rax ; \ + xorl %ebp, %ebp ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; 
\ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %rdx ; \ + leaq (%rcx,%rax), %rdi ; \ + shlq $0x16, %rdx ; \ + shlq $0x16, %rdi ; \ + sarq $0x2b, %rdx ; \ + sarq $0x2b, %rdi ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %rbx ; \ + leaq (%rcx,%rax), %rcx ; \ + sarq $0x2a, %rbx ; \ + sarq $0x2a, %rcx ; \ + movq %rdx, MAT(%rsp) ; \ + movq %rbx, MAT+0x8(%rsp) ; \ + movq %rdi, MAT+0x10(%rsp) ; \ + movq %rcx, MAT+0x18(%rsp) ; \ + movq fin, %r12 ; \ + imulq %r12, %rdi ; \ + imulq %rdx, %r12 ; \ + movq gin, %r13 ; \ + imulq %r13, %rbx ; \ + imulq %rcx, %r13 ; \ + addq %rbx, %r12 ; \ + addq %rdi, %r13 ; \ + sarq $0x14, %r12 ; \ + sarq 
$0x14, %r13 ; \ + movq %r12, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + movq %r13, %rcx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq $0xfffffffffffffffe, %rax ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, 
%r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %r8 ; \ + leaq (%rcx,%rax), %r10 ; \ + shlq $0x16, %r8 ; \ + shlq $0x16, %r10 ; \ + sarq $0x2b, %r8 ; \ + sarq $0x2b, %r10 ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %r15 ; \ + leaq (%rcx,%rax), %r11 ; \ + sarq $0x2a, %r15 ; \ + sarq $0x2a, %r11 ; \ + movq %r13, %rbx ; \ + movq %r12, %rcx ; \ + imulq %r8, %r12 ; \ + imulq %r15, %rbx ; \ + addq %rbx, %r12 ; \ + imulq %r11, %r13 ; \ + imulq %r10, %rcx ; \ + addq %rcx, %r13 ; \ + sarq $0x14, %r12 ; \ + sarq $0x14, %r13 ; \ + movq %r12, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + movq %r13, %rcx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq MAT(%rsp), %rax ; \ + imulq %r8, %rax ; \ + movq MAT+0x10(%rsp), %rdx ; \ + imulq %r15, %rdx ; \ + imulq MAT+0x8(%rsp), %r8 ; \ + imulq 
MAT+0x18(%rsp), %r15 ; \ + addq %r8, %r15 ; \ + leaq (%rax,%rdx), %r9 ; \ + movq MAT(%rsp), %rax ; \ + imulq %r10, %rax ; \ + movq MAT+0x10(%rsp), %rdx ; \ + imulq %r11, %rdx ; \ + imulq MAT+0x8(%rsp), %r10 ; \ + imulq MAT+0x18(%rsp), %r11 ; \ + addq %r10, %r11 ; \ + leaq (%rax,%rdx), %r13 ; \ + movq $0xfffffffffffffffe, %rax ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + 
subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %r8 ; \ + leaq (%rcx,%rax), %r12 ; \ + shlq $0x15, %r8 ; \ + shlq $0x15, %r12 ; \ + sarq $0x2b, %r8 ; \ + sarq $0x2b, %r12 ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %r10 ; \ + leaq (%rcx,%rax), %r14 ; \ + sarq $0x2b, %r10 ; \ + sarq $0x2b, %r14 ; \ + movq %r9, %rax ; \ + imulq %r8, %rax ; \ + movq %r13, %rdx ; \ + imulq %r10, %rdx ; \ + imulq %r15, %r8 ; \ + imulq %r11, %r10 ; \ + addq %r8, %r10 ; \ + leaq (%rax,%rdx), %r8 ; \ + movq %r9, %rax ; \ + imulq %r12, %rax ; \ + movq %r13, %rdx ; \ + imulq %r14, %rdx ; \ + imulq %r15, %r12 ; \ + imulq %r11, %r14 ; \ + addq %r12, %r14 ; \ + leaq (%rax,%rdx), %r12 + +S2N_BN_SYMBOL(bignum_inv_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room for temporaries + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Save the return pointer for the end so we can overwrite %rdi later + + movq %rdi, res + +// Create constant [%rdx;%rcx;%rbx;%rax] = p_256 and copy it into the variable 
f +// including the 5th zero digit + + xorl %ecx, %ecx + movl $0x00000000ffffffff, %edx + movq %rdx, %rbx + leaq -1(%rcx), %rax + negq %rdx + movq %rax, F(%rsp) + movq %rbx, F+8(%rsp) + movq %rcx, F+16(%rsp) + movq %rdx, F+24(%rsp) + movq %rcx, F+32(%rsp) + +// Now reduce the input modulo p_256, first negating the constant to get +// [%rdx;%rcx;%rbx;%rax] = 2^256 - p_256, adding it to x and hence getting +// the comparison x < p_256 <=> (2^256 - p_256) + x < 2^256 and choosing +// g accordingly. + + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %r11 + + leaq 1(%rcx), %rax + addq %r8, %rax + leaq -1(%rdx), %rbx + adcq %r9, %rbx + notq %rcx + adcq %r10, %rcx + notq %rdx + adcq %r11, %rdx + + cmovncq %r8, %rax + cmovncq %r9, %rbx + cmovncq %r10, %rcx + cmovncq %r11, %rdx + + movq %rax, G(%rsp) + movq %rbx, G+8(%rsp) + movq %rcx, G+16(%rsp) + movq %rdx, G+24(%rsp) + xorl %eax, %eax + movq %rax, G+32(%rsp) + +// Also maintain reduced < 2^256 vector [u,v] such that +// [f,g] == x * 2^{5*i-50} * [u,v] (mod p_256) +// starting with [p_256,x] == x * 2^{5*0-50} * [0,2^50] (mod p_256) +// The weird-looking 5*i modifications come in because we are doing +// 64-bit word-sized Montgomery reductions at each stage, which is +// 5 bits more than the 59-bit requirement to keep things stable. + + xorl %eax, %eax + movq %rax, U(%rsp) + movq %rax, U+8(%rsp) + movq %rax, U+16(%rsp) + movq %rax, U+24(%rsp) + + movq $0x0004000000000000, %rcx + movq %rcx, V(%rsp) + movq %rax, V+8(%rsp) + movq %rax, V+16(%rsp) + movq %rax, V+24(%rsp) + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special tenth iteration after a uniform +// first 9. + + movq $10, i + movq $1, d + jmp bignum_inv_p256_midloop + +bignum_inv_p256_loop: + +// Separate out the matrix into sign-magnitude pairs + + movq %r8, %r9 + sarq $63, %r9 + xorq %r9, %r8 + subq %r9, %r8 + + movq %r10, %r11 + sarq $63, %r11 + xorq %r11, %r10 + subq %r11, %r10 + + movq %r12, %r13 + sarq $63, %r13 + xorq %r13, %r12 + subq %r13, %r12 + + movq %r14, %r15 + sarq $63, %r15 + xorq %r15, %r14 + subq %r15, %r14 + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in temporary storage for the [u,v] part and do [f,g] first. + + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, tmp + + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, tmp2 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. 
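+//
+// (Why the offsets above work: each matrix entry is kept as a magnitude m
+// together with a sign bitmask s, and a signed product is then formed as
+// (digit XOR s) * m.  Since -x = NOT(x) + 1 across digits, the complemented
+// products come out short by exactly m whenever s is set, which is what the
+// per-row offset (m AND s), kept in %rdi/%rsi here and in tmp/tmp2 for the
+// later [u,v] pass, restores.)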
+// +// Digit 0 of [f,g] + + xorl %ebx, %ebx + movq F(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq G(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq F(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq G(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + +// Digit 1 of [f,g] + + xorl %ecx, %ecx + movq F+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq G+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $59, %rbx, %rdi + movq %rdi, F(%rsp) + + xorl %edi, %edi + movq F+N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq G+N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $59, %rbp, %rsi + movq %rsi, G(%rsp) + +// Digit 2 of [f,g] + + xorl %esi, %esi + movq F+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq G+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $59, %rcx, %rbx + movq %rbx, F+N(%rsp) + + xorl %ebx, %ebx + movq F+2*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq G+2*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $59, %rdi, %rbp + movq %rbp, G+N(%rsp) + +// Digits 3 and 4 of [f,g] + + movq F+3*N(%rsp), %rax + xorq %r9, %rax + movq F+4*N(%rsp), %rbp + xorq %r9, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq G+3*N(%rsp), %rax + xorq %r11, %rax + movq G+4*N(%rsp), %rdx + xorq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $59, %rsi, %rcx + movq %rcx, F+2*N(%rsp) + shrdq $59, %rbp, %rsi + sarq $59, %rbp + + movq F+3*N(%rsp), %rax + movq %rsi, F+3*N(%rsp) + + movq F+4*N(%rsp), %rsi + movq %rbp, F+4*N(%rsp) + + xorq %r13, %rax + xorq %r13, %rsi + andq %r12, %rsi + negq %rsi + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rsi + movq G+3*N(%rsp), %rax + xorq %r15, %rax + movq G+4*N(%rsp), %rdx + xorq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $59, %rbx, %rdi + movq %rdi, G+2*N(%rsp) + shrdq $59, %rsi, %rbx + movq %rbx, G+3*N(%rsp) + sarq $59, %rsi + movq %rsi, G+4*N(%rsp) + +// Get the initial carries back from storage and do the [u,v] accumulation + + movq tmp, %rbx + movq tmp2, %rbp + +// Digit 0 of [u,v] + + xorl %ecx, %ecx + movq U(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + + xorl %esi, %esi + movq U(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V(%rsp) + +// Digit 1 of [u,v] + + xorl %ebx, %ebx + movq U+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq U+N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, U+N(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq V+N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, V+N(%rsp) + +// Digit 2 of [u,v] + + xorl %ecx, %ecx + movq U+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq 
%rax, %rbx + adcq %rdx, %rcx + movq V+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + + xorl %esi, %esi + movq U+2*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U+2*N(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V+2*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V+2*N(%rsp) + +// Digits 3 and 4 of u (top is unsigned) + + movq U+3*N(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+3*N(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + +// Preload for last use of old u digit 3 + + movq U+3*N(%rsp), %rax + movq %rcx, U+3*N(%rsp) + movq %rdx, U+4*N(%rsp) + +// Digits 3 and 4 of v (top is unsigned) + + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx + negq %rcx + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rcx + movq V+3*N(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rsi, V+3*N(%rsp) + movq %rdx, V+4*N(%rsp) + +// Montgomery reduction of u + + amontred(u) + +// Montgomery reduction of v + + amontred(v) + +bignum_inv_p256_midloop: + + divstep59(d,ff,gg) + movq %rsi, d + +// Next iteration + + decq i + jnz bignum_inv_p256_loop + +// The 10th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + movq F(%rsp), %rax + movq G(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $63, %rax + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * [u,v] (mod p_256) +// we want to flip the sign of u according to that of f. 
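+//
+// Below, %rax is a bitmask that is all 1s exactly when f = -1; XORing it
+// into each sign mask composes the sign of the matrix entry with the sign
+// of f.  The single remaining accumulation (digits 0..4 of u) therefore
+// computes sign(f) * (entry00 * u + entry01 * v), which after one more
+// Montgomery reduction and the final strict reduction modulo p_256 is the
+// result written to z.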
+ + movq %r8, %r9 + sarq $63, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + + movq %r10, %r11 + sarq $63, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + + movq %r12, %r13 + sarq $63, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + + movq %r14, %r15 + sarq $63, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + +// Adjust the initial value to allow for complement instead of negation + + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + +// Digit 0 of [u] + + xorl %r13d, %r13d + movq U(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq V(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + +// Digit 1 of [u] + + xorl %r14d, %r14d + movq U+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq V+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + +// Digit 2 of [u] + + xorl %r15d, %r15d + movq U+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq V+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + +// Digits 3 and 4 of u (top is unsigned) + + movq U+3*N(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq V+3*N(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + +// Store back and Montgomery reduce u + + movq %r12, U(%rsp) + movq %r13, U+N(%rsp) + movq %r14, U+2*N(%rsp) + movq %r15, U+3*N(%rsp) + movq %r9, U+4*N(%rsp) + + amontred(u) + +// Perform final strict reduction mod p_256 and copy to output + + movq U(%rsp), %r8 + movq U+N(%rsp), %r9 + movq U+2*N(%rsp), %r10 + movq U+3*N(%rsp), %r11 + + movl $1, %eax + movl $0xffffffff, %ebx + leaq -2(%rax), %rcx + leaq -1(%rbx), %rdx + notq %rbx + + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + + cmovncq %r8, %rax + cmovncq %r9, %rbx + cmovncq %r10, %rcx + cmovncq %r11, %rdx + + movq res, %rdi + movq %rax, (%rdi) + movq %rbx, N(%rdi) + movq %rcx, 2*N(%rdi) + movq %rdx, 3*N(%rdi) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_littleendian_4.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_littleendian_4.S new file mode 100644 index 00000000000..e378441427a --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_littleendian_4.S @@ -0,0 +1,74 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert 4-digit (256-bit) bignum to/from little-endian form +// Input x[4]; output z[4] +// +// extern void bignum_littleendian_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// The same function is given two other prototypes whose names reflect the +// treatment of one or other argument as a byte array rather than word array: +// +// extern void bignum_fromlebytes_4 +// (uint64_t z[static 4], uint8_t x[static 32]); +// +// extern void bignum_tolebytes_4 +// (uint8_t z[static 32], uint64_t x[static 4]); +// +// Since x86 is little-endian, this is just copying. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_littleendian_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_littleendian_4) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_fromlebytes_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_fromlebytes_4) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tolebytes_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tolebytes_4) + + .text + +#define z %rdi +#define x %rsi +#define a %rax + +S2N_BN_SYMBOL(bignum_littleendian_4): +S2N_BN_SYMBOL(bignum_fromlebytes_4): +S2N_BN_SYMBOL(bignum_tolebytes_4): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + + movq (x), a + movq a, (z) + + movq 8(x), a + movq a, 8(z) + + movq 16(x), a + movq a, 16(z) + + movq 24(x), a + movq a, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_n256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_n256.S new file mode 100644 index 00000000000..2ada6c29bbb --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_n256.S @@ -0,0 +1,205 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod n_256 +// Input x[k]; output z[4] +// +// extern void bignum_mod_n256 +// (uint64_t z[static 4], uint64_t k, uint64_t *x); +// +// Reduction is modulo the group order of the NIST curve P-256. 
+// +// Standard x86-64 ABI: RDI = z, RSI = k, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = k, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n256) + .text + +#define z %rdi +#define k %rsi +#define x %rcx + +#define m0 %r8 +#define m1 %r9 +#define m2 %r10 +#define m3 %r11 +#define d %r12 + +#define n0 %rax +#define n1 %rbx +#define n3 %rdx +#define q %rdx + +#define n0short %eax +#define n3short %edx + + +S2N_BN_SYMBOL(bignum_mod_n256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save extra registers + + pushq %rbx + pushq %r12 + +// If the input is already <= 3 words long, go to a trivial "copy" path + + cmpq $4, k + jc bignum_mod_n256_shortinput + +// Otherwise load the top 4 digits (top-down) and reduce k by 4 + + subq $4, k + movq 24(%rdx,k,8), m3 + movq 16(%rdx,k,8), m2 + movq 8(%rdx,k,8), m1 + movq (%rdx,k,8), m0 + +// Move x into another register to leave %rdx free for multiplies and use of n3 + + movq %rdx, x + +// Reduce the top 4 digits mod n_256 (a conditional subtraction of n_256) + + movq $0x0c46353d039cdaaf, n0 + movq $0x4319055258e8617b, n1 + movl $0x00000000ffffffff, n3short + + addq n0, m0 + adcq n1, m1 + adcq $0, m2 + adcq n3, m3 + sbbq d, d + notq d + andq d, n0 + andq d, n1 + andq d, n3 + subq n0, m0 + sbbq n1, m1 + sbbq $0, m2 + sbbq n3, m3 + +// Now do (k-4) iterations of 5->4 word modular reduction + + testq k, k + jz bignum_mod_n256_writeback + +bignum_mod_n256_loop: + +// Writing the input as z = 2^256 * m3 + 2^192 * m2 + t = 2^192 * h + t, our +// intended quotient approximation is MIN ((h + h>>32 + 1)>>64) (2^64 - 1). + + movq m3, n0 + shldq $32, m2, n0 + movq m3, q + shrq $32, q + + xorq n1, n1 + subq $1, n1 + + adcq m2, n0 + adcq m3, q + sbbq n0, n0 + orq n0, q + +// Load the next digit so current m to reduce = [m3;m2;m1;m0;d] + + movq -8(x,k,8), d + +// Now form [m3;m2;m1;m0;d] = m - q * n_256 + + subq q, m3 + movq $0x0c46353d039cdaaf, n0 + mulxq n0, n0, n1 + addq n0, d + adcq n1, m0 + movq $0x4319055258e8617b, n0 + mulxq n0, n0, n1 + adcq $0, n1 + addq n0, m0 + adcq n1, m1 + movl $0x00000000ffffffff, n0short + mulxq n0, n0, n1 + adcq n0, m2 + adcq n1, m3 + +// Now our top word m3 is either zero or all 1s. 
Use it for a masked +// addition of n_256, which we can do by a *subtraction* of +// 2^256 - n_256 from our portion + + movq $0x0c46353d039cdaaf, n0 + andq m3, n0 + movq $0x4319055258e8617b, n1 + andq m3, n1 + movl $0x00000000ffffffff, n3short + andq m3, n3 + + subq n0, d + sbbq n1, m0 + sbbq $0, m1 + sbbq n3, m2 + +// Now shuffle registers up and loop + + movq m2, m3 + movq m1, m2 + movq m0, m1 + movq d, m0 + + decq k + jnz bignum_mod_n256_loop + +// Write back + +bignum_mod_n256_writeback: + + movq m0, (z) + movq m1, 8(z) + movq m2, 16(z) + movq m3, 24(z) + +// Restore registers and return + + popq %r12 + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +bignum_mod_n256_shortinput: + + xorq m0, m0 + xorq m1, m1 + xorq m2, m2 + xorq m3, m3 + + testq k, k + jz bignum_mod_n256_writeback + movq (%rdx), m0 + decq k + jz bignum_mod_n256_writeback + movq 8(%rdx), m1 + decq k + jz bignum_mod_n256_writeback + movq 16(%rdx), m2 + jmp bignum_mod_n256_writeback + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_n256_4.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_n256_4.S new file mode 100644 index 00000000000..2c29a4ca607 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_n256_4.S @@ -0,0 +1,100 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod n_256 +// Input x[4]; output z[4] +// +// extern void bignum_mod_n256_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Reduction is modulo the group order of the NIST curve P-256. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n256_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n256_4) + .text + +#define z %rdi +#define x %rsi + + + +#define d0 %rdx +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n0 %rax +#define n1 %r10 +#define n3 %r11 + +#define n3short %r11d + +// Can re-use this as a temporary once we've loaded the input + +#define c %rsi + +S2N_BN_SYMBOL(bignum_mod_n256_4): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load a set of registers [n3; 0; n1; n0] = 2^256 - n_256 + + movq $0x0c46353d039cdaaf, n0 + movq $0x4319055258e8617b, n1 + movl $0x00000000ffffffff, n3short + +// Load the input and compute x + (2^256 - n_256) + + movq (x), d0 + addq n0, d0 + movq 8(x), d1 + adcq n1, d1 + movq 16(x), d2 + adcq $0, d2 + movq 24(x), d3 + adcq n3, d3 + +// Now CF is set iff 2^256 <= x + (2^256 - n_256), i.e. iff n_256 <= x. 
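As a cross-check of that carry argument, here is a small C model of the same conditional subtraction (limbs least significant first; the constants are the 2^256 - n_256 words loaded above). The assembly masks the constants and subtracts them back, while the sketch keeps the sum only when the carry fires; the net effect is the same. The helper name and the explicit temporary array are ours, not the library's.

    #include <stdint.h>

    /* Reduce a 4-limb x (already < 2*n_256) modulo n_256 with one conditional
       subtraction: add 2^256 - n_256, and keep the wrapped sum exactly when
       the addition carries out of 2^256, i.e. when n_256 <= x. */
    static void cond_sub_n256(uint64_t d[4]) {
        static const uint64_t R[4] = {   /* 2^256 - n_256 */
            0x0c46353d039cdaafULL, 0x4319055258e8617bULL,
            0x0000000000000000ULL, 0x00000000ffffffffULL
        };
        unsigned __int128 c = 0;
        uint64_t t[4];
        for (int i = 0; i < 4; i++) {    /* t = x + (2^256 - n_256) */
            c += (unsigned __int128)d[i] + R[i];
            t[i] = (uint64_t)c;
            c >>= 64;
        }
        uint64_t keep = (uint64_t)0 - (uint64_t)(c & 1);  /* all-ones iff n_256 <= x */
        for (int i = 0; i < 4; i++)
            d[i] = (d[i] & ~keep) | (t[i] & keep);
    }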
+// Create a mask for the condition x < n, and mask the three nontrivial digits +// ready to undo the previous addition with a compensating subtraction + + sbbq c, c + notq c + andq c, n0 + andq c, n1 + andq c, n3 + +// Now subtract mask * (2^256 - n_256) again and store + + subq n0, d0 + movq d0, (z) + sbbq n1, d1 + movq d1, 8(z) + sbbq $0, d2 + movq d2, 16(z) + sbbq n3, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_n256_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_n256_alt.S new file mode 100644 index 00000000000..a3ef32f51fd --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_n256_alt.S @@ -0,0 +1,213 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod n_256 +// Input x[k]; output z[4] +// +// extern void bignum_mod_n256_alt +// (uint64_t z[static 4], uint64_t k, uint64_t *x); +// +// Reduction is modulo the group order of the NIST curve P-256. +// +// Standard x86-64 ABI: RDI = z, RSI = k, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = k, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n256_alt) + .text + +#define z %rdi +#define k %rsi +#define x %rcx + +#define m0 %r8 +#define m1 %r9 +#define m2 %r10 +#define m3 %r11 +#define d %r12 + +#define n0 %rax +#define n1 %rbx +#define n3 %rdx + +#define q %rbx + +#define n0short %eax +#define n3short %edx + + +S2N_BN_SYMBOL(bignum_mod_n256_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save extra registers + + pushq %rbx + pushq %r12 + +// If the input is already <= 3 words long, go to a trivial "copy" path + + cmpq $4, k + jc bignum_mod_n256_alt_shortinput + +// Otherwise load the top 4 digits (top-down) and reduce k by 4 + + subq $4, k + movq 24(%rdx,k,8), m3 + movq 16(%rdx,k,8), m2 + movq 8(%rdx,k,8), m1 + movq (%rdx,k,8), m0 + +// Move x into another register to leave %rdx free for multiplies and use of n3 + + movq %rdx, x + +// Reduce the top 4 digits mod n_256 (a conditional subtraction of n_256) + + movq $0x0c46353d039cdaaf, n0 + movq $0x4319055258e8617b, n1 + movl $0x00000000ffffffff, n3short + + addq n0, m0 + adcq n1, m1 + adcq $0, m2 + adcq n3, m3 + sbbq d, d + notq d + andq d, n0 + andq d, n1 + andq d, n3 + subq n0, m0 + sbbq n1, m1 + sbbq $0, m2 + sbbq n3, m3 + +// Now do (k-4) iterations of 5->4 word modular reduction + + testq k, k + jz bignum_mod_n256_alt_writeback + +bignum_mod_n256_alt_loop: + +// Writing the input as z = 2^256 * m3 + 2^192 * m2 + t = 2^192 * h + t, our +// intended quotient approximation is MIN ((h + h>>32 + 1)>>64) (2^64 - 1). 
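The MIN expression in that comment translates directly to C. A hedged sketch (the function name is illustrative), using a 128-bit h = 2^64*m3 + m2 and an explicit saturation standing in for the sbb/or trick in the code below:

    #include <stdint.h>

    /* Quotient estimate for the 5->4 reduction step:
       q = MIN((h + (h >> 32) + 1) >> 64, 2^64 - 1),
       where h is formed from the top two limbs of the value being reduced. */
    static uint64_t quotient_estimate(uint64_t m2, uint64_t m3) {
        unsigned __int128 h = ((unsigned __int128)m3 << 64) | m2;
        unsigned __int128 s = h + (h >> 32) + 1;
        if (s < h)                      /* carried out of 128 bits: saturate */
            return ~(uint64_t)0;
        return (uint64_t)(s >> 64);     /* otherwise already <= 2^64 - 1     */
    }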
+ + movq m3, n0 + shldq $32, m2, n0 + movq m3, q + shrq $32, q + + xorq %rdx, %rdx + subq $1, %rdx + + adcq m2, n0 + adcq m3, q + sbbq n0, n0 + orq n0, q + +// Load the next digit so current m to reduce = [m3;m2;m1;m0;d] + + movq -8(x,k,8), d + +// Now form [m3;m2;m1;m0;d] = m - q * n_256 + + subq q, m3 + + movq $0x0c46353d039cdaaf, %rax + mulq q + addq %rax, d + adcq %rdx, m0 + adcq $0, m1 + adcq $0, m2 + adcq $0, m3 + + movq $0x4319055258e8617b, %rax + mulq q + addq %rax, m0 + adcq %rdx, m1 + adcq $0, m2 + adcq $0, m3 + + movq $0x00000000ffffffff, %rax + mulq q + addq %rax, m2 + adcq %rdx, m3 + +// Now our top word m3 is either zero or all 1s. Use it for a masked +// addition of n_256, which we can do by a *subtraction* of +// 2^256 - n_256 from our portion + + movq $0x0c46353d039cdaaf, n0 + andq m3, n0 + movq $0x4319055258e8617b, n1 + andq m3, n1 + movl $0x00000000ffffffff, n3short + andq m3, n3 + + subq n0, d + sbbq n1, m0 + sbbq $0, m1 + sbbq n3, m2 + +// Now shuffle registers up and loop + + movq m2, m3 + movq m1, m2 + movq m0, m1 + movq d, m0 + + decq k + jnz bignum_mod_n256_alt_loop + +// Write back + +bignum_mod_n256_alt_writeback: + + movq m0, (z) + movq m1, 8(z) + movq m2, 16(z) + movq m3, 24(z) + +// Restore registers and return + + popq %r12 + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +bignum_mod_n256_alt_shortinput: + + xorq m0, m0 + xorq m1, m1 + xorq m2, m2 + xorq m3, m3 + + testq k, k + jz bignum_mod_n256_alt_writeback + movq (%rdx), m0 + decq k + jz bignum_mod_n256_alt_writeback + movq 8(%rdx), m1 + decq k + jz bignum_mod_n256_alt_writeback + movq 16(%rdx), m2 + jmp bignum_mod_n256_alt_writeback + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_p256.S new file mode 100644 index 00000000000..19576bc4a51 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_p256.S @@ -0,0 +1,198 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo field characteristic, z := x mod p_256 +// Input x[k]; output z[4] +// +// extern void bignum_mod_p256 +// (uint64_t z[static 4], uint64_t k, uint64_t *x); +// +// Standard x86-64 ABI: RDI = z, RSI = k, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = k, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_p256) + .text + +#define z %rdi +#define k %rsi +#define x %rcx + +#define m0 %r8 +#define m1 %r9 +#define m2 %r10 +#define m3 %r11 +#define d %r12 + +#define n0 %rax +#define n1 %rbx +#define n3 %rdx +#define q %rdx + +#define n0short %eax +#define n1short %ebx + + +S2N_BN_SYMBOL(bignum_mod_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save extra registers + + pushq %rbx + pushq %r12 + +// If the input is already <= 3 words long, go to a trivial "copy" path + + cmpq $4, k + jc bignum_mod_p256_shortinput + +// Otherwise load the top 4 digits (top-down) and reduce k by 4 + + subq $4, k + movq 24(%rdx,k,8), m3 + movq 16(%rdx,k,8), m2 + movq 8(%rdx,k,8), m1 + movq (%rdx,k,8), m0 + +// Move x into another register to leave %rdx free for multiplies and use of n3 + + movq %rdx, x + +// Load non-trivial digits [n3; 0; n1; -1] = p_256 and do a conditional +// subtraction to reduce the four starting digits [m3;m2;m1;m0] modulo p_256 + + subq $-1, m0 + movl $0x00000000ffffffff, n1short + sbbq n1, m1 + movq $0xffffffff00000001, n3 + sbbq $0, m2 + sbbq n3, m3 + + sbbq n0, n0 + + andq n0, n1 + andq n0, n3 + addq n0, m0 + adcq n1, m1 + adcq $0, m2 + adcq n3, m3 + +// Now do (k-4) iterations of 5->4 word modular reduction + + testq k, k + jz bignum_mod_p256_writeback + +bignum_mod_p256_loop: + +// Writing the input as z = 2^256 * m3 + 2^192 * m2 + t = 2^192 * h + t, our +// intended quotient approximation is MIN ((h + h>>32 + 1)>>64) (2^64 - 1). 
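The loop below ends with a masked addition of p_256 whenever the quotient estimate overshoots and the top word becomes an all-ones bitmask. A minimal C model of that correction on a 4-limb window is given here; the real code folds it into the 5-word window and simply drops the final carry, which cancels the earlier borrow. The helper name and the 4-limb simplification are ours.

    #include <stdint.h>

    /* Conditionally add p_256 back under a mask that is 0 or all-ones. */
    static void masked_add_p256(uint64_t d[4], uint64_t mask) {
        static const uint64_t P256[4] = {
            0xffffffffffffffffULL, 0x00000000ffffffffULL,
            0x0000000000000000ULL, 0xffffffff00000001ULL
        };
        unsigned __int128 c = 0;
        for (int i = 0; i < 4; i++) {
            c += (unsigned __int128)d[i] + (P256[i] & mask);
            d[i] = (uint64_t)c;
            c >>= 64;
        }
        /* any carry out is discarded: it offsets the borrow that set the mask */
    }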
+ + movq m3, n0 + shldq $32, m2, n0 + movq m3, q + shrq $32, q + + xorq n1, n1 + subq $1, n1 + + adcq m2, n0 + adcq m3, q + sbbq n0, n0 + orq n0, q + +// Load the next digit so current m to reduce = [m3;m2;m1;m0;d] + + movq -8(x,k,8), d + +// Now compute the initial pre-reduced [m3;m2;m1;m0;d] = m - p_256 * q +// = z - (2^256 - 2^224 + 2^192 + 2^96 - 1) * q +// = z - 2^192 * 0xffffffff00000001 * q - 2^64 * 0x0000000100000000 * q + q + + addq q, d + movq $0x0000000100000000, n0 + mulxq n0, n0, n1 + sbbq $0, n0 + sbbq $0, n1 + subq n0, m0 + sbbq n1, m1 + movq $0xffffffff00000001, n0 + mulxq n0, n0, n1 + sbbq n0, m2 + sbbq n1, m3 + +// Now our top word m3 is either zero or all 1s, and we use this to discriminate +// whether a correction is needed because our result is negative, as a bitmask +// Do a masked addition of p_256 + + movl $0x00000000ffffffff, n0short + andq m3, n0 + xorq n1, n1 + subq n0, n1 + addq m3, d + adcq n0, m0 + adcq $0, m1 + adcq n1, m2 + +// Shuffle registers up and loop + + movq m2, m3 + movq m1, m2 + movq m0, m1 + movq d, m0 + + decq k + jnz bignum_mod_p256_loop + +// Write back + +bignum_mod_p256_writeback: + + movq m0, (z) + movq m1, 8(z) + movq m2, 16(z) + movq m3, 24(z) + +// Restore registers and return + + popq %r12 + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +bignum_mod_p256_shortinput: + + xorq m0, m0 + xorq m1, m1 + xorq m2, m2 + xorq m3, m3 + + testq k, k + jz bignum_mod_p256_writeback + movq (%rdx), m0 + decq k + jz bignum_mod_p256_writeback + movq 8(%rdx), m1 + decq k + jz bignum_mod_p256_writeback + movq 16(%rdx), m2 + jmp bignum_mod_p256_writeback + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_p256_4.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_p256_4.S new file mode 100644 index 00000000000..f87013791fb --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_p256_4.S @@ -0,0 +1,88 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo field characteristic, z := x mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_mod_p256_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_p256_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_p256_4) + .text + +#define z %rdi +#define x %rsi + +#define d0 %rdx +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n1 %r10 +#define n3 %r11 +#define c %rax + +#define n1short %r10d + + + +S2N_BN_SYMBOL(bignum_mod_p256_4): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load the input and subtract to get [d3;d3;d1;d1] = x - p_256 (modulo 2^256) +// The constants n1 and n3 in [n3; 0; n1; -1] = p_256 are saved for later + + movq (x), d0 + subq $-1, d0 + movq 8(x), d1 + movl $0x00000000ffffffff, n1short + sbbq n1, d1 + movq 16(x), d2 + sbbq $0, d2 + movq $0xffffffff00000001, n3 + movq 24(x), d3 + sbbq n3, d3 + +// Capture the carry to determine whether to add back p_256, and use +// it to create a masked p_256' = [n3; 0; n1; c] + + sbbq c, c + andq c, n1 + andq c, n3 + +// Do the corrective addition and copy to output + + addq c, d0 + movq d0, (z) + adcq n1, d1 + movq d1, 8(z) + adcq $0, d2 + movq d2, 16(z) + adcq n3, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_p256_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_p256_alt.S new file mode 100644 index 00000000000..7ae9566d4ec --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_p256_alt.S @@ -0,0 +1,202 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo field characteristic, z := x mod p_256 +// Input x[k]; output z[4] +// +// extern void bignum_mod_p256_alt +// (uint64_t z[static 4], uint64_t k, uint64_t *x); +// +// Standard x86-64 ABI: RDI = z, RSI = k, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = k, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_p256_alt) + .text + +#define z %rdi +#define k %rsi +#define x %rcx + +#define m0 %r8 +#define m1 %r9 +#define m2 %r10 +#define m3 %r11 +#define d %r12 + +#define n0 %rax + +#define n1 %rbx +#define q %rbx + +#define n3 %rdx + +#define n0short %eax +#define n1short %ebx + + +S2N_BN_SYMBOL(bignum_mod_p256_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save extra registers + + pushq %rbx + pushq %r12 + +// If the input is already <= 3 words long, go to a trivial "copy" path + + cmpq $4, k + jc bignum_mod_p256_alt_shortinput + +// Otherwise load the top 4 digits (top-down) and reduce k by 4 + + subq $4, k + movq 24(%rdx,k,8), m3 + movq 16(%rdx,k,8), m2 + movq 8(%rdx,k,8), m1 + movq (%rdx,k,8), m0 + +// Move x into another register to leave %rdx free for multiplies and use of n3 + + movq %rdx, x + +// Load non-trivial digits [n3; 0; n1; -1] = p_256 and do a conditional +// subtraction to reduce the four starting digits [m3;m2;m1;m0] modulo p_256 + + subq $-1, m0 + movl $0x00000000ffffffff, n1short + sbbq n1, m1 + movq $0xffffffff00000001, n3 + sbbq $0, m2 + sbbq n3, m3 + + sbbq n0, n0 + + andq n0, n1 + andq n0, n3 + addq n0, m0 + adcq n1, m1 + adcq $0, m2 + adcq n3, m3 + +// Now do (k-4) iterations of 5->4 word modular reduction + + testq k, k + jz bignum_mod_p256_alt_writeback + +bignum_mod_p256_alt_loop: + +// Writing the input as z = 2^256 * m3 + 2^192 * m2 + t = 2^192 * h + t, our +// intended quotient approximation is MIN ((h + h>>32 + 1)>>64) (2^64 - 1). 
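This _alt variant performs the same reduction with the classic widening mulq instead of the mulxq/adcx/adox used in bignum_mod_p256, presumably so it can run on x86-64 CPUs without the BMI2/ADX extensions. The building block it relies on is the 64x64 -> 128 multiply, which in C can be modeled as follows (illustrative helper, not from the source):

    #include <stdint.h>

    /* Widening 64x64 -> 128 multiply, the C counterpart of mulq: the low and
       high halves of the product feed the add/adc and sub/sbb chains below. */
    static void mul_64x64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo) {
        unsigned __int128 p = (unsigned __int128)a * b;
        *lo = (uint64_t)p;
        *hi = (uint64_t)(p >> 64);
    }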
+ + movq m3, n0 + shldq $32, m2, n0 + movq m3, q + shrq $32, q + + xorq n3, n3 + subq $1, n3 + + adcq m2, n0 + adcq m3, q + sbbq n0, n0 + orq n0, q + +// Load the next digit so current m to reduce = [m3;m2;m1;m0;d] + + movq -8(x,k,8), d + +// Now compute the initial pre-reduced [m3;m2;m1;m0;d] = m - p_256 * q +// = z - (2^256 - 2^224 + 2^192 + 2^96 - 1) * q +// = z - 2^192 * 0xffffffff00000001 * q - 2^64 * 0x0000000100000000 * q + q + + movq $0x0000000100000000, %rax + mulq q + addq q, d + sbbq $0, %rax + sbbq $0, %rdx + subq %rax, m0 + sbbq %rdx, m1 + sbbq $0, m2 + sbbq $0, m3 + movq $0xffffffff00000001, %rax + mulq q + subq %rax, m2 + sbbq %rdx, m3 + +// Now our top word m3 is either zero or all 1s, and we use this to discriminate +// whether a correction is needed because our result is negative, as a bitmask +// Do a masked addition of p_256 + + movl $0x00000000ffffffff, n0short + andq m3, n0 + xorq n1, n1 + subq n0, n1 + addq m3, d + adcq n0, m0 + adcq $0, m1 + adcq n1, m2 + +// Shuffle registers up and loop + + movq m2, m3 + movq m1, m2 + movq m0, m1 + movq d, m0 + + decq k + jnz bignum_mod_p256_alt_loop + +// Write back + +bignum_mod_p256_alt_writeback: + + movq m0, (z) + movq m1, 8(z) + movq m2, 16(z) + movq m3, 24(z) + +// Restore registers and return + + popq %r12 + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +bignum_mod_p256_alt_shortinput: + + xorq m0, m0 + xorq m1, m1 + xorq m2, m2 + xorq m3, m3 + + testq k, k + jz bignum_mod_p256_alt_writeback + movq (%rdx), m0 + decq k + jz bignum_mod_p256_alt_writeback + movq 8(%rdx), m1 + decq k + jz bignum_mod_p256_alt_writeback + movq 16(%rdx), m2 + jmp bignum_mod_p256_alt_writeback + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/p256/bignum_montinv_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montinv_p256.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p256/bignum_montinv_p256.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montinv_p256.S index 36f5d376e0c..b595645db9f 100644 --- a/third_party/s2n-bignum/x86_att/p256/bignum_montinv_p256.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montinv_p256.S @@ -1017,6 +1017,7 @@ leaq (%rax,%rdx), %r12 S2N_BN_SYMBOL(bignum_montinv_p256): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montmul_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montmul_p256.S new file mode 100644 index 00000000000..5267c29f3b8 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montmul_p256.S @@ -0,0 +1,191 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^256) mod p_256 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_montmul_p256 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Does z := (2^{-256} * x * y) mod p_256, assuming that the inputs x and y +// satisfy x * y <= 2^256 * p_256 (in particular this is true if we are in +// the "usual" case x < p_256 and y < p_256). 
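Since the low limb of p_256 is 2^64 - 1, the Montgomery constant -p_256^{-1} mod 2^64 is simply 1, so each word-level reduction step adds (low word) * p_256 and shifts the window down one limb. The routine below merges two such steps per pass, driving them with the constants 0x0000000100000000 and 0xffffffff00000001; a one-step reference model in C, with illustrative names, is:

    #include <stdint.h>

    /* One word-level Montgomery reduction step modulo p_256: replace the
       5-limb value t by (t + t[0] * p_256) / 2^64, which is exact because
       t + t[0] * p_256 == 0 (mod 2^64). Limbs are least significant first. */
    static void montgomery_step_p256(uint64_t t[5]) {
        static const uint64_t P256[4] = {
            0xffffffffffffffffULL, 0x00000000ffffffffULL,
            0x0000000000000000ULL, 0xffffffff00000001ULL
        };
        uint64_t m = t[0];                  /* -p^{-1} mod 2^64 is 1 for p_256 */
        unsigned __int128 c = (unsigned __int128)m * P256[0] + t[0];
        c >>= 64;                           /* the low limb is now zero        */
        for (int i = 1; i < 4; i++) {
            c += (unsigned __int128)m * P256[i] + t[i];
            t[i - 1] = (uint64_t)c;
            c >>= 64;
        }
        c += t[4];
        t[3] = (uint64_t)c;
        t[4] = (uint64_t)(c >> 64);
    }

Four such steps, interleaved with the four product rows, bring the accumulator back to a 5-word form before the final conditional subtraction.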
+// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p256) + .text + +#define z %rdi +#define x %rsi + +// We move the y argument here so we can use %rdx for multipliers + +#define y %rcx + +// Add %rdx * m into a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rbx as temporaries + +#define mulpadd(high,low,m) \ + mulxq m, %rax, %rbx ; \ + adcxq %rax, low ; \ + adoxq %rbx, high + +S2N_BN_SYMBOL(bignum_montmul_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Copy y into a safe register to start with + + movq %rdx, y + +// Do row 0 computation, which is a bit different: +// set up initial window [%r12,%r11,%r10,%r9,%r8] = y[0] * x +// Unlike later, we only need a single carry chain + + xorl %r13d, %r13d + movq (y), %rdx + mulxq (x), %r8, %r9 + mulxq 8(x), %rbx, %r10 + adcq %rbx, %r9 + mulxq 16(x), %rbx, %r11 + adcq %rbx, %r10 + mulxq 24(x), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + +// Add row 1 + + movq 8(y), %rdx + xorl %r14d, %r14d + mulpadd(%r10,%r9,(x)) + mulpadd(%r11,%r10,8(x)) + mulpadd(%r12,%r11,16(x)) + mulpadd(%r13,%r12,24(x)) + adcq %r14, %r13 + +// Montgomery reduce windows 0 and 1 together + + xorl %r15d, %r15d + movq $0x0000000100000000, %rdx + mulpadd(%r10,%r9,%r8) + mulpadd(%r11,%r10,%r9) + notq %rdx + leaq 2(%rdx), %rdx + mulpadd(%r12,%r11,%r8) + mulpadd(%r13,%r12,%r9) + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + +// Add row 2 + + movq 16(y), %rdx + xorl %r8d, %r8d + mulpadd(%r11,%r10,(x)) + mulpadd(%r12,%r11,8(x)) + mulpadd(%r13,%r12,16(x)) + adoxq %r8, %r14 + mulxq 24(x), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + +// Add row 3 + + movq 24(y), %rdx + xorl %r9d, %r9d + mulpadd(%r12,%r11,(x)) + mulpadd(%r13,%r12,8(x)) + mulpadd(%r14,%r13,16(x)) + adoxq %r9, %r15 + mulxq 24(x), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + +// Montgomery reduce windows 2 and 3 together + + xorl %r9d, %r9d + movq $0x0000000100000000, %rdx + mulpadd(%r12,%r11,%r10) + mulpadd(%r13,%r12,%r11) + notq %rdx + leaq 2(%rdx), %rdx + mulpadd(%r14,%r13,%r10) + mulpadd(%r15,%r14,%r11) + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + +// We now have a pre-reduced 5-word form [%r8; %r15;%r14;%r13;%r12] +// Load [%rax;%r11;%rbx;%rdx;%rcx] = 2^320 - p_256, re-using earlier numbers a bit +// Do [%rax;%r11;%rbx;%rdx;%rcx] = [%r8;%r15;%r14;%r13;%r12] + (2^320 - p_256) + + movl $1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0x00000000fffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + +// Now carry is set if r + (2^320 - p_256) >= 2^320, i.e. r >= p_256 +// where r is the pre-reduced form. So conditionally select the +// output accordingly. 
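The cmovc sequence that follows is a constant-time 4-word select keyed on that carry. An equivalent branch-free formulation in C (a sketch only; names are ours, and a compiler may or may not turn the mask form back into cmov):

    #include <stdint.h>

    /* Take the already-offset value when the carry fired (r >= p_256),
       otherwise keep r; carry is 0 or 1, and no data-dependent branch is used. */
    static void select_reduced(uint64_t z[4], const uint64_t r[4],
                               const uint64_t r_offset[4], unsigned carry) {
        uint64_t m = (uint64_t)0 - (uint64_t)(carry & 1);  /* all-ones iff carry */
        for (int i = 0; i < 4; i++)
            z[i] = r[i] ^ (m & (r[i] ^ r_offset[i]));      /* cmovc equivalent   */
    }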
+ + cmovcq %rcx, %r12 + cmovcq %rdx, %r13 + cmovcq %r9, %r14 + cmovcq %r11, %r15 + +// Write back reduced value + + movq %r12, (z) + movq %r13, 8(z) + movq %r14, 16(z) + movq %r15, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montmul_p256_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montmul_p256_alt.S new file mode 100644 index 00000000000..9161da2cdb7 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montmul_p256_alt.S @@ -0,0 +1,214 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^256) mod p_256 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_montmul_p256_alt +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Does z := (2^{-256} * x * y) mod p_256, assuming that the inputs x and y +// satisfy x * y <= 2^256 * p_256 (in particular this is true if we are in +// the "usual" case x < p_256 and y < p_256). +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p256_alt) + .text + +#define z %rdi +#define x %rsi + +// We move the y argument here so we can use %rdx for multipliers + +#define y %rcx + +// Add %rbx * m into a register-pair (high,low) maintaining consistent +// carry-catching with carry (negated, as bitmask) and using %rax and %rdx +// as temporaries + +#define mulpadd(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rbx; \ + subq carry, %rdx ; \ + addq %rax, low ; \ + adcq %rdx, high ; \ + sbbq carry, carry + +// Initial version assuming no carry-in + +#define mulpadi(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rbx; \ + addq %rax, low ; \ + adcq %rdx, high ; \ + sbbq carry, carry + +// End version not catching the top carry-out + +#define mulpade(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rbx; \ + subq carry, %rdx ; \ + addq %rax, low ; \ + adcq %rdx, high + +S2N_BN_SYMBOL(bignum_montmul_p256_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Copy y into a safe register to start with + + movq %rdx, y + +// Do row 0 computation, which is a bit different: +// set up initial window [%r12,%r11,%r10,%r9,%r8] = y[0] * x +// Unlike later, we only need a single carry chain + + movq (y), %rbx + movq (x), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + + movq 8(x), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + + movq 16(x), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + + movq 24(x), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + +// Add row 1 + + movq 8(y), %rbx + xorl %r13d, %r13d + mulpadi(%r14,%r10,%r9,(x)) + mulpadd(%r14,%r11,%r10,8(x)) + 
mulpadd(%r14,%r12,%r11,16(x)) + mulpade(%r14,%r13,%r12,24(x)) + +// Montgomery reduce windows 0 and 1 together + + xorl %r14d, %r14d + movq $0x0000000100000000, %rbx + mulpadi(%r15,%r10,%r9,%r8) + mulpadd(%r15,%r11,%r10,%r9) + notq %rbx + leaq 2(%rbx), %rbx + mulpadd(%r15,%r12,%r11,%r8) + mulpade(%r15,%r13,%r12,%r9) + adcq %r14, %r14 + +// Add row 2 + + movq 16(y), %rbx + xorl %r15d, %r15d + mulpadi(%r8,%r11,%r10,(x)) + mulpadd(%r8,%r12,%r11,8(x)) + mulpadd(%r8,%r13,%r12,16(x)) + mulpade(%r8,%r14,%r13,24(x)) + adcq %r15, %r15 + +// Add row 3 + + movq 24(y), %rbx + xorl %r8d, %r8d + mulpadi(%r9,%r12,%r11,(x)) + mulpadd(%r9,%r13,%r12,8(x)) + mulpadd(%r9,%r14,%r13,16(x)) + mulpade(%r9,%r15,%r14,24(x)) + adcq %r8, %r8 + +// Montgomery reduce windows 2 and 3 together + + xorl %r9d, %r9d + movq $0x0000000100000000, %rbx + mulpadi(%rcx,%r12,%r11,%r10) + mulpadd(%rcx,%r13,%r12,%r11) + notq %rbx + leaq 2(%rbx), %rbx + mulpadd(%rcx,%r14,%r13,%r10) + mulpade(%rcx,%r15,%r14,%r11) + adcq %r9, %r8 + +// We now have a pre-reduced 5-word form [%r8; %r15;%r14;%r13;%r12] +// Load [%rax;%r11;%r9;%rbx;%rcx] = 2^320 - p_256, re-using earlier numbers a bit +// Do [%rax;%r11;%r9;%rbx;%rcx] = [%r8;%r15;%r14;%r13;%r12] + (2^320 - p_256) + + movl $1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0x00000000fffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + +// Now carry is set if r + (2^320 - p_256) >= 2^320, i.e. r >= p_256 +// where r is the pre-reduced form. So conditionally select the +// output accordingly. + + cmovcq %rcx, %r12 + cmovcq %rbx, %r13 + cmovcq %r9, %r14 + cmovcq %r11, %r15 + +// Write back reduced value + + movq %r12, (z) + movq %r13, 8(z) + movq %r14, 16(z) + movq %r15, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montsqr_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montsqr_p256.S new file mode 100644 index 00000000000..ca2c79c9997 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montsqr_p256.S @@ -0,0 +1,189 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^256) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_montsqr_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Does z := (x^2 / 2^256) mod p_256, assuming x^2 <= 2^256 * p_256, which is +// guaranteed in particular if x < p_256 initially (the "intended" case). 
+// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p256) + .text + +#define z %rdi +#define x %rsi + +// Use this fairly consistently for a zero + +#define zero %rbp +#define zeroe %ebp + +// Add %rdx * m into a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rbx as temporaries + +#define mulpadd(high,low,m) \ + mulxq m, %rax, %rbx ; \ + adcxq %rax, low ; \ + adoxq %rbx, high + +S2N_BN_SYMBOL(bignum_montsqr_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Compute [%r15;%r8] = [00] which we use later, but mainly +// set up an initial window [%r14;...;%r9] = [23;03;01] + + movq (x), %rdx + mulxq %rdx, %r8, %r15 + mulxq 8(x), %r9, %r10 + mulxq 24(x), %r11, %r12 + movq 16(x), %rdx + mulxq 24(x), %r13, %r14 + +// Clear our zero register, and also initialize the flags for the carry chain + + xorl zeroe, zeroe + +// Chain in the addition of 02 + 12 + 13 to that window (no carry-out possible) +// This gives all the "heterogeneous" terms of the squaring ready to double + + mulpadd(%r11,%r10,(x)) + mulpadd(%r12,%r11,8(x)) + movq 24(x), %rdx + mulpadd(%r13,%r12,8(x)) + adcxq zero, %r13 + adoxq zero, %r14 + adcq zero, %r14 + +// Double and add to the 00 + 11 + 22 + 33 terms + + xorl zeroe, zeroe + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 8(x), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 16(x), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 24(x), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq zero, %r15 + adoxq zero, %r15 + +// First two waves of Montgomery reduction. Consolidate the double carries +// in %r9 and propagate up to the top in %r8, which is no longer needed otherwise. + + xorl zeroe, zeroe + movq $0x0000000100000000, %rdx + mulpadd(%r10,%r9,%r8) + mulpadd(%r11,%r10,%r9) + movq $0xffffffff00000001, %rdx + mulpadd(%r12,%r11,%r8) + mulpadd(%r13,%r12,%r9) + adcxq zero, %r13 + movl zeroe, %r9d + adoxq zero, %r9 + adcxq zero, %r9 + addq %r9, %r14 + adcq zero, %r15 + movl zeroe, %r8d + adcq zero, %r8 + +// Now two more steps of Montgomery reduction, again with %r8 = top carry + + xorl zeroe, zeroe + movq $0x0000000100000000, %rdx + mulpadd(%r12,%r11,%r10) + mulpadd(%r13,%r12,%r11) + movq $0xffffffff00000001, %rdx + mulpadd(%r14,%r13,%r10) + mulpadd(%r15,%r14,%r11) + adcxq zero, %r15 + adoxq zero, %r8 + adcq zero, %r8 + +// Load [%rax;%r11;%rbp;%rdx;%rcx] = 2^320 - p_256, re-using earlier numbers a bit +// Do [%rax;%r11;%rbp;%rdx;%rcx] = [%r8;%r15;%r14;%r13;%r12] + (2^320 - p_256) + + movl $1, %ecx + addq %r12, %rcx + leaq -1(%rdx), %rdx + adcq %r13, %rdx + leaq -1(%rbp), %rbp + movq %rbp, %rax + adcq %r14, %rbp + movl $0x00000000fffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + +// Now carry is set if r + (2^320 - p_256) >= 2^320, i.e. r >= p_256 +// where r is the pre-reduced form. So conditionally select the +// output accordingly. 
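The squaring above computes each heterogeneous product x_i*x_j (i < j) exactly once, doubles the whole window, and only then adds the squares x_i^2, which is what makes it cheaper than a general multiply. The same structure is shown here with 32-bit limbs so the arithmetic fits in plain uint64_t; this is purely an illustration, not the 4-limb code.

    #include <stdint.h>

    /* (h*2^32 + l)^2 = h^2*2^64 + 2*(l*h)*2^32 + l^2: form the cross product
       once, double it, then add the two squares. */
    static void sqr_2x32(uint32_t l, uint32_t h, uint64_t out[2]) {
        uint64_t ll = (uint64_t)l * l;
        uint64_t hh = (uint64_t)h * h;
        uint64_t cross = (uint64_t)l * h;            /* heterogeneous term     */
        uint64_t cross2_lo = cross << 1;             /* doubled, 65 bits total */
        uint64_t cross2_hi = cross >> 63;
        uint64_t add_lo = cross2_lo << 32;           /* place at bit offset 32 */
        uint64_t add_hi = (cross2_lo >> 32) | (cross2_hi << 32);
        uint64_t lo = ll + add_lo;
        uint64_t hi = hh + add_hi + (lo < ll);       /* carry from the low word */
        out[0] = lo;
        out[1] = hi;
    }

    int main(void) {
        uint64_t out[2];
        sqr_2x32(0xdeadbeefu, 0x12345678u, out);
        unsigned __int128 x = ((unsigned __int128)0x12345678u << 32) | 0xdeadbeefu;
        unsigned __int128 s = x * x;
        return !(out[0] == (uint64_t)s && out[1] == (uint64_t)(s >> 64));
    }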
+ + cmovcq %rcx, %r12 + cmovcq %rdx, %r13 + cmovcq %rbp, %r14 + cmovcq %r11, %r15 + +// Write back reduced value + + movq %r12, (z) + movq %r13, 8(z) + movq %r14, 16(z) + movq %r15, 24(z) + +// Restore saved registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montsqr_p256_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montsqr_p256_alt.S new file mode 100644 index 00000000000..688560cc56f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montsqr_p256_alt.S @@ -0,0 +1,212 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^256) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_montsqr_p256_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Does z := (x^2 / 2^256) mod p_256, assuming x^2 <= 2^256 * p_256, which is +// guaranteed in particular if x < p_256 initially (the "intended" case). +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p256_alt) + .text + +#define z %rdi +#define x %rsi + +// Add %rbx * m into a register-pair (high,low) maintaining consistent +// carry-catching with carry (negated, as bitmask) and using %rax and %rdx +// as temporaries + +#define mulpadd(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rbx; \ + subq carry, %rdx ; \ + addq %rax, low ; \ + adcq %rdx, high ; \ + sbbq carry, carry + +// Initial version assuming no carry-in + +#define mulpadi(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rbx; \ + addq %rax, low ; \ + adcq %rdx, high ; \ + sbbq carry, carry + +// End version not catching the top carry-out + +#define mulpade(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rbx; \ + subq carry, %rdx ; \ + addq %rax, low ; \ + adcq %rdx, high + +S2N_BN_SYMBOL(bignum_montsqr_p256_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Compute [%r15;%r8] = [00] which we use later, but mainly +// set up an initial window [%r14;...;%r9] = [23;03;01] + + movq (x), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 8(x), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 24(x), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 16(x), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + +// Chain in the addition of 02 + 12 + 13 to that window (no carry-out possible) +// This gives all the "heterogeneous" terms of the squaring ready to double + + mulpadi(%rcx,%r11,%r10,(x)) + mulpadd(%rcx,%r12,%r11,8(x)) + movq 24(x), %rbx + mulpade(%rcx,%r13,%r12,8(x)) + adcq $0, %r14 + +// Double the window [%r14;...;%r9], catching top carry in %rcx + + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, 
%r13 + adcq %r14, %r14 + adcq %rcx, %rcx + +// Add to the 00 + 11 + 22 + 33 terms + + movq 8(x), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 16(x), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 24(x), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + +// First two waves of Montgomery reduction, now re-using %r8 for top carry + + movq $0x0000000100000000, %rbx + mulpadi(%rcx,%r10,%r9,%r8) + mulpadd(%rcx,%r11,%r10,%r9) + notq %rbx + leaq 2(%rbx), %rbx + mulpadd(%rcx,%r12,%r11,%r8) + xorl %r8d, %r8d + mulpade(%rcx,%r13,%r12,%r9) + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + +// Now two more steps of Montgomery reduction, again with %r8 = top carry + + movq $0x0000000100000000, %rbx + mulpadi(%rcx,%r12,%r11,%r10) + mulpadd(%rcx,%r13,%r12,%r11) + notq %rbx + leaq 2(%rbx), %rbx + mulpadd(%rcx,%r14,%r13,%r10) + xorl %r9d, %r9d + mulpade(%rcx,%r15,%r14,%r11) + adcq %r9, %r8 + +// Load [%rax;%r11;%r9;%rbx;%rcx] = 2^320 - p_256, re-using earlier numbers a bit +// Do [%rax;%r11;%r9;%rbx;%rcx] = [%r8;%r15;%r14;%r13;%r12] + (2^320 - p_256) + + movl $1, %ecx + addq %r12, %rcx + leaq -1(%rbx), %rbx + adcq %r13, %rbx + leaq -1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0x00000000fffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + +// Now carry is set if r + (2^320 - p_256) >= 2^320, i.e. r >= p_256 +// where r is the pre-reduced form. So conditionally select the +// output accordingly. + + cmovcq %rcx, %r12 + cmovcq %rbx, %r13 + cmovcq %r9, %r14 + cmovcq %r11, %r15 + +// Write back reduced value + + movq %r12, (z) + movq %r13, 8(z) + movq %r14, 16(z) + movq %r15, 24(z) + +// Restore saved registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mux_4.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mux_4.S new file mode 100644 index 00000000000..709b6d4998f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mux_4.S @@ -0,0 +1,74 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// 256-bit multiplex/select z := x (if p nonzero) or z := y (if p zero) +// Inputs p, x[4], y[4]; output z[4] +// +// extern void bignum_mux_4 +// (uint64_t p, uint64_t z[static 4], +// uint64_t x[static 4], uint64_t y[static 4]); +// +// It is assumed that all numbers x, y and z have the same size 4 digits. 
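A branch-free C model of the selection this routine performs (the assembly uses cmovz, which is equally data-independent; the reference name below is ours):

    #include <stdint.h>

    /* z := x if p is nonzero, else z := y; all operands are 4 limbs. */
    static void bignum_mux_4_ref(uint64_t p, uint64_t z[4],
                                 const uint64_t x[4], const uint64_t y[4]) {
        uint64_t m = (uint64_t)0 - (uint64_t)(p != 0);   /* all-ones iff p != 0 */
        for (int i = 0; i < 4; i++)
            z[i] = y[i] ^ (m & (x[i] ^ y[i]));
    }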
+// +// Standard x86-64 ABI: RDI = p, RSI = z, RDX = x, RCX = y +// Microsoft x64 ABI: RCX = p, RDX = z, R8 = x, R9 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mux_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mux_4) + .text + +#define p %rdi +#define z %rsi +#define x %rdx +#define y %rcx +#define a %rax +#define b %r8 + + +S2N_BN_SYMBOL(bignum_mux_4): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + testq p, p + + movq (x), a + movq (y), b + cmovzq b, a + movq a, (z) + + movq 8(x), a + movq 8(y), b + cmovzq b, a + movq a, 8(z) + + movq 16(x), a + movq 16(y), b + cmovzq b, a + movq a, 16(z) + + movq 24(x), a + movq 24(y), b + cmovzq b, a + movq a, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_neg_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_neg_p256.S new file mode 100644 index 00000000000..d5ddab107b9 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_neg_p256.S @@ -0,0 +1,93 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Negate modulo p_256, z := (-x) mod p_256, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_neg_p256 (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_neg_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_neg_p256) + .text + +#define z %rdi +#define x %rsi + +#define q %rdx + +#define d0 %rax +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n1 %r10 +#define n3 %r11 + +#define d0short %eax +#define n1short %r10d + +S2N_BN_SYMBOL(bignum_neg_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load the input digits as [d3;d2;d1;d0] and also set a bitmask q +// for the input being nonzero, so that we avoid doing -0 = p_256 +// and hence maintain strict modular reduction + + movq (x), d0 + movq 8(x), d1 + movq d0, n1 + orq d1, n1 + movq 16(x), d2 + movq 24(x), d3 + movq d2, n3 + orq d3, n3 + orq n1, n3 + negq n3 + sbbq q, q + +// Load the non-trivial words of p_256 = [n3;0;n1;-1] and mask them with q + + movl $0x00000000ffffffff, n1short + movq $0xffffffff00000001, n3 + andq q, n1 + andq q, n3 + +// Do the subtraction, getting it as [n3;d0;n1;q] to avoid moves + + subq d0, q + movl $0, d0short + sbbq d1, n1 + sbbq d2, d0 + sbbq d3, n3 + +// Write back + + movq q, (z) + movq n1, 8(z) + movq d0, 16(z) + movq n3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_nonzero_4.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_nonzero_4.S new file mode 100644 index 00000000000..0daab9a2177 --- /dev/null +++ 
b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_nonzero_4.S @@ -0,0 +1,58 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// 256-bit nonzeroness test, returning 1 if x is nonzero, 0 if x is zero +// Input x[4]; output function return +// +// extern uint64_t bignum_nonzero_4(uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = x, returns RAX +// Microsoft x64 ABI: RCX = x, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_nonzero_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_nonzero_4) + .text + +#define x %rdi +#define a %rax +#define d %rdx +#define dshort %edx + + + +S2N_BN_SYMBOL(bignum_nonzero_4): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi +#endif + +// Generate a = an OR of all the words in the bignum + + movq (x), a + movq 8(x), d + orq 16(x), a + orq 24(x), d + orq d, a + +// Set a standard C condition based on whether a is nonzero + + movl $1, dshort + cmovnzq d, a + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_optneg_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_optneg_p256.S new file mode 100644 index 00000000000..91a1b95f697 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_optneg_p256.S @@ -0,0 +1,102 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally negate modulo p_256, z := (-x) mod p_256 (if p nonzero) or +// z := x (if p zero), assuming x reduced +// Inputs p, x[4]; output z[4] +// +// extern void bignum_optneg_p256 +// (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = p, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = p, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optneg_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optneg_p256) + .text + +#define z %rdi +#define q %rsi +#define x %rdx + +#define n0 %rax +#define n1 %rcx +#define n2 %r8 +#define n3 %r9 + +#define n1short %ecx + +S2N_BN_SYMBOL(bignum_optneg_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Adjust q by zeroing it if the input is zero (to avoid giving -0 = p_256, +// which is not strictly reduced even though it's correct modulo p_256). +// This step is redundant if we know a priori that the input is nonzero, which +// is the case for the y coordinate of points on the P-256 curve, for example. 
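Putting that together, a C sketch of the routine's overall semantics: negate x modulo p_256 only when the caller asked for it and x is nonzero, otherwise pass x through, all without a data-dependent branch. The reference name and least-significant-first limb order are ours.

    #include <stdint.h>

    static void optneg_p256_ref(uint64_t z[4], uint64_t p, const uint64_t x[4]) {
        static const uint64_t P256[4] = {
            0xffffffffffffffffULL, 0x00000000ffffffffULL,
            0x0000000000000000ULL, 0xffffffff00000001ULL
        };
        uint64_t nonzero = x[0] | x[1] | x[2] | x[3];
        /* negate only if requested AND x != 0, so we never emit p_256 itself */
        uint64_t m = (uint64_t)0 - (uint64_t)((p != 0) & (nonzero != 0));
        uint64_t t[4], borrow = 0;
        for (int i = 0; i < 4; i++) {        /* t = p_256 - x (x is reduced)  */
            unsigned __int128 d =
                (unsigned __int128)P256[i] - x[i] - borrow;
            t[i] = (uint64_t)d;
            borrow = (uint64_t)(d >> 64) & 1;
        }
        for (int i = 0; i < 4; i++)          /* select t or x under the mask  */
            z[i] = x[i] ^ (m & (x[i] ^ t[i]));
    }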
+ + movq (x), n0 + orq 8(x), n0 + movq 16(x), n1 + orq 24(x), n1 + orq n1, n0 + negq n0 + sbbq n0, n0 + andq n0, q + +// Turn q into a bitmask, all 1s for q=false, all 0s for q=true + + negq q + sbbq q, q + notq q + +// Let [n3;n2;n1;n0] = if q then p_256 else -1 + + movq $0xffffffffffffffff, n0 + movl $0x00000000ffffffff, n1short + orq q, n1 + movq q, n2 + movq $0xffffffff00000001, n3 + orq q, n3 + +// Subtract so [n3;n2;n1;n0] = if q then p_256 - x else -1 - x + + subq (x), n0 + sbbq 8(x), n1 + sbbq 16(x), n2 + sbbq 24(x), n3 + +// XOR the words with the bitmask, which in the case q = false has the +// effect of restoring ~(-1 - x) = -(-1 - x) - 1 = 1 + x - 1 = x +// and write back the digits to the output + + xorq q, n0 + movq n0, (z) + xorq q, n1 + movq n1, 8(z) + xorq q, n2 + movq n2, 16(z) + xorq q, n3 + movq n3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_sub_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_sub_p256.S new file mode 100644 index 00000000000..3cccec875ed --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_sub_p256.S @@ -0,0 +1,89 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Subtract modulo p_256, z := (x - y) mod p_256 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_sub_p256 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub_p256) + .text + +#define z %rdi +#define x %rsi +#define y %rdx + +#define d0 %rax +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n1 %r10 +#define n3 %rdx +#define c %r11 + +#define n1short %r10d + + + +S2N_BN_SYMBOL(bignum_sub_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Load and subtract the two inputs as [d3;d2;d1;d0] = x - y (modulo 2^256) + + movq (x), d0 + subq (y), d0 + movq 8(x), d1 + sbbq 8(y), d1 + movq 16(x), d2 + sbbq 16(y), d2 + movq 24(x), d3 + sbbq 24(y), d3 + +// Capture the carry, which indicates x < y, and create corresponding masked +// correction p_256' = [n3; 0; n1; c] to add + + movl $0x00000000ffffffff, n1short + sbbq c, c + xorq n3, n3 + andq c, n1 + subq n1, n3 + +// Do the corrective addition and copy to output + + addq c, d0 + movq d0, (z) + adcq n1, d1 + movq d1, 8(z) + adcq $0, d2 + movq d2, 16(z) + adcq n3, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_tomont_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_tomont_p256.S new file mode 100644 index 00000000000..f748a82d27a --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_tomont_p256.S @@ -0,0 +1,191 @@ +// Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert to Montgomery form z := (2^256 * x) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_tomont_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tomont_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tomont_p256) + .text + +#define z %rdi +#define x %rsi + +// Some temp registers for the last correction stage + +#define d %rax +#define u %rdx +#define v %rcx + +#define dshort %eax +#define ushort %edx + +// Add %rdx * m into a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rbx as temporaries + +#define mulpadd(high,low,m) \ + mulxq m, %rax, %rcx ; \ + adcxq %rax, low ; \ + adoxq %rcx, high + +S2N_BN_SYMBOL(bignum_tomont_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// We are essentially just doing a Montgomery multiplication of x and the +// precomputed constant y = 2^512 mod p, so the code is almost the same +// modulo a few registers and the change from loading y[i] to using constants. +// Because there is no y pointer to keep, we use one register less. + + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Do row 0 computation, which is a bit different: +// set up initial window [%r12,%r11,%r10,%r9,%r8] = y[0] * x +// Unlike later, we only need a single carry chain + + xorq %r13, %r13 + movl $0x0000000000000003, %edx + mulxq (x), %r8, %r9 + mulxq 8(x), %rcx, %r10 + adcxq %rcx, %r9 + mulxq 16(x), %rcx, %r11 + adcxq %rcx, %r10 + mulxq 24(x), %rcx, %r12 + adcxq %rcx, %r11 + adcxq %r13, %r12 + +// Add row 1 + + movq $0xfffffffbffffffff, %rdx + xorq %r14, %r14 + mulpadd(%r10,%r9,(x)) + mulpadd(%r11,%r10,8(x)) + mulpadd(%r12,%r11,16(x)) + mulpadd(%r13,%r12,24(x)) + adcq %r14, %r13 + +// Montgomery reduce windows 0 and 1 together + + xorq %r15, %r15 + movq $0x0000000100000000, %rdx + mulpadd(%r10,%r9,%r8) + mulpadd(%r11,%r10,%r9) + movq $0xffffffff00000001, %rdx + mulpadd(%r12,%r11,%r8) + mulpadd(%r13,%r12,%r9) + adcxq %r15, %r13 + adoxq %r15, %r14 + adcxq %r15, %r14 + +// Add row 2 + + movq $0xfffffffffffffffe, %rdx + xorq %r8, %r8 + mulpadd(%r11,%r10,(x)) + mulpadd(%r12,%r11,8(x)) + mulpadd(%r13,%r12,16(x)) + mulpadd(%r14,%r13,24(x)) + adcxq %r8, %r14 + adoxq %r8, %r15 + adcxq %r8, %r15 + +// Add row 3 + + movq $0x00000004fffffffd, %rdx + xorq %r9, %r9 + mulpadd(%r12,%r11,(x)) + mulpadd(%r13,%r12,8(x)) + mulpadd(%r14,%r13,16(x)) + mulpadd(%r15,%r14,24(x)) + adcxq %r9, %r15 + adoxq %r9, %r8 + adcxq %r9, %r8 + +// Montgomery reduce windows 2 and 3 together + + xorq %r9, %r9 + movq $0x0000000100000000, %rdx + mulpadd(%r12,%r11,%r10) + mulpadd(%r13,%r12,%r11) + movq $0xffffffff00000001, %rdx + mulpadd(%r14,%r13,%r10) + mulpadd(%r15,%r14,%r11) + adcxq %r9, %r15 + adoxq %r9, %r8 + adcxq %r9, %r8 + +// We now have a pre-reduced 5-word form [%r8; %r15;%r14;%r13;%r12] +// Load non-trivial digits of p_256 = [v; 0; u; -1] + + movl $0x00000000ffffffff, ushort + movq $0xffffffff00000001, v + +// Now do the subtraction (0,p_256-1) - (%r8,%r15,%r14,%r13,%r12) to get the carry + + movq $-2, d + subq %r12, d + movq u, d + sbbq %r13, d + movl $0, dshort 
+ sbbq %r14, d + movq v, d + sbbq %r15, d + +// This last last comparison in the chain will actually even set the mask +// for us, so we don't need to separately create it from the carry. +// This means p_256 - 1 < (c,d1,d0,d5,d4), i.e. we are so far >= p_256 + + movl $0, dshort + sbbq %r8, d + andq d, u + andq d, v + +// Do a masked subtraction of p_256 and write back + + subq d, %r12 + sbbq u, %r13 + sbbq $0, %r14 + sbbq v, %r15 + + movq %r12, (z) + movq %r13, 8(z) + movq %r14, 16(z) + movq %r15, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_tomont_p256_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_tomont_p256_alt.S new file mode 100644 index 00000000000..15a10edc19c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_tomont_p256_alt.S @@ -0,0 +1,199 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert to Montgomery form z := (2^256 * x) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_tomont_p256_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tomont_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tomont_p256_alt) + .text + +#define z %rdi +#define x %rsi + +// Add %rcx * m into a register-pair (high,low) maintaining consistent +// carry-catching with carry (negated, as bitmask) and using %rax and %rdx +// as temporaries + +#define mulpadd(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rcx; \ + subq carry, %rdx ; \ + addq %rax, low ; \ + adcq %rdx, high ; \ + sbbq carry, carry + +// Initial version assuming no carry-in + +#define mulpadi(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rcx; \ + addq %rax, low ; \ + adcq %rdx, high ; \ + sbbq carry, carry + +// End version not catching the top carry-out + +#define mulpade(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rcx; \ + subq carry, %rdx ; \ + addq %rax, low ; \ + adcq %rdx, high + +S2N_BN_SYMBOL(bignum_tomont_p256_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Do row 0 computation, which is a bit different: +// set up initial window [%r12,%r11,%r10,%r9,%r8] = y[0] * x +// Unlike later, we only need a single carry chain + + movl $0x0000000000000003, %ecx + movq (x), %rax + mulq %rcx + movq %rax, %r8 + movq %rdx, %r9 + + movq 8(x), %rax + mulq %rcx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + + movq 16(x), %rax + mulq %rcx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + + movq 24(x), %rax + mulq %rcx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + +// Add row 1 + + movq $0xfffffffbffffffff, %rcx + xorl %r13d, %r13d + mulpadi(%r14,%r10,%r9,(x)) + mulpadd(%r14,%r11,%r10,8(x)) + mulpadd(%r14,%r12,%r11,16(x)) + mulpade(%r14,%r13,%r12,24(x)) + +// Montgomery reduce windows 0 and 1 together + + 
xorl %r14d, %r14d + movq $0x0000000100000000, %rcx + mulpadi(%r15,%r10,%r9,%r8) + mulpadd(%r15,%r11,%r10,%r9) + notq %rcx + leaq 2(%rcx), %rcx + mulpadd(%r15,%r12,%r11,%r8) + mulpade(%r15,%r13,%r12,%r9) + adcq %r14, %r14 + +// Add row 2 + + movq $0xfffffffffffffffe, %rcx + xorl %r15d, %r15d + mulpadi(%r8,%r11,%r10,(x)) + mulpadd(%r8,%r12,%r11,8(x)) + mulpadd(%r8,%r13,%r12,16(x)) + mulpade(%r8,%r14,%r13,24(x)) + adcq %r15, %r15 + +// Add row 3 + + movq $0x00000004fffffffd, %rcx + xorl %r8d, %r8d + mulpadi(%r9,%r12,%r11,(x)) + mulpadd(%r9,%r13,%r12,8(x)) + mulpadd(%r9,%r14,%r13,16(x)) + mulpade(%r9,%r15,%r14,24(x)) + adcq %r8, %r8 + +// Montgomery reduce windows 2 and 3 together + + movq $0x0000000100000000, %rcx + mulpadi(%r9,%r12,%r11,%r10) + mulpadd(%r9,%r13,%r12,%r11) + notq %rcx + leaq 2(%rcx), %rcx + mulpadd(%r9,%r14,%r13,%r10) + mulpadd(%r9,%r15,%r14,%r11) + subq %r9, %r8 + +// We now have a pre-reduced 5-word form [%r8; %r15;%r14;%r13;%r12] +// Load [%rax;%r11;%r9;%rcx;%rdx] = 2^320 - p_256, re-using earlier numbers a bit +// Do [%rax;%r11;%r9;%rcx;%rdx] = [%r8;%r15;%r14;%r13;%r12] + (2^320 - p_256) + + xorl %edx, %edx + leaq -1(%rdx), %r9 + incq %rdx + addq %r12, %rdx + decq %rcx + adcq %r13, %rcx + movq %r9, %rax + adcq %r14, %r9 + movl $0x00000000fffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + +// Now carry is set if r + (2^320 - p_256) >= 2^320, i.e. r >= p_256 +// where r is the pre-reduced form. So conditionally select the +// output accordingly. + + cmovcq %rdx, %r12 + cmovcq %rcx, %r13 + cmovcq %r9, %r14 + cmovcq %r11, %r15 + +// Write back reduced value + + movq %r12, (z) + movq %r13, 8(z) + movq %r14, 16(z) + movq %r15, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_triple_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_triple_p256.S new file mode 100644 index 00000000000..0893b1004f0 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_triple_p256.S @@ -0,0 +1,132 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Triple modulo p_256, z := (3 * x) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_triple_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// The input x can be any 4-digit bignum, not necessarily reduced modulo p_256, +// and the result is always fully reduced, i.e. z = (3 * x) mod p_256. 
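Editor's note: the header above promises a fully reduced result for any 4-digit input; the mechanism, explained in the comments further down in this file, is the quotient estimate q = h + 1 where h is the top (fifth) word of 3 * x. The following portable C sketch restates that reduction. It is illustrative only: the names and limb layout are assumptions, and the assembly below avoids the generic q * p_256 product by exploiting the special shape of p_256.

#include <stdint.h>

typedef unsigned __int128 u128;

// p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1, little-endian 64-bit limbs
static const uint64_t P256[4] = {
    0xffffffffffffffffULL, 0x00000000ffffffffULL,
    0x0000000000000000ULL, 0xffffffff00000001ULL};

// Sketch of z := (3 * x) mod p_256 using the q = h + 1 quotient estimate.
static void triple_p256_sketch(uint64_t z[4], const uint64_t x[4]) {
  uint64_t t[5], qp[5];
  u128 carry = 0;

  // t = 3 * x kept as five limbs, so h = t[4] = floor(3*x / 2^256)
  for (int i = 0; i < 4; i++) {
    carry += (u128)x[i] * 3;
    t[i] = (uint64_t)carry;
    carry >>= 64;
  }
  t[4] = (uint64_t)carry;

  // Quotient estimate q = h + 1 and the five-limb product q * p_256
  uint64_t q = t[4] + 1;
  carry = 0;
  for (int i = 0; i < 4; i++) {
    carry += (u128)q * P256[i];
    qp[i] = (uint64_t)carry;
    carry >>= 64;
  }
  qp[4] = (uint64_t)carry;

  // r = t - q * p_256; the estimate guarantees -p_256 <= r < p_256
  uint64_t r[4], borrow = 0;
  for (int i = 0; i < 5; i++) {
    u128 d = (u128)t[i] - qp[i] - borrow;
    if (i < 4) r[i] = (uint64_t)d;
    borrow = (uint64_t)(d >> 127);  // 1 exactly when the subtraction wrapped
  }

  // If r went negative (final borrow set), add back a single p_256
  uint64_t addback = (uint64_t)0 - borrow;
  carry = 0;
  for (int i = 0; i < 4; i++) {
    carry += (u128)r[i] + (P256[i] & addback);
    z[i] = (uint64_t)carry;
    carry >>= 64;
  }
}

Since x < 2^256, h <= 2 and so q <= 3, which is what makes a single conditional add-back of p_256 sufficient to finish the reduction.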
+// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_p256) + .text + +#define z %rdi +#define x %rsi + +// Main digits of intermediate results + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +// Quotient estimate = top of product + 1 + +#define q %rdx + +// Other temporary variables and their short version + +#define a %rax +#define c %rcx + +#define ashort %eax +#define cshort %ecx +#define qshort %edx + +S2N_BN_SYMBOL(bignum_triple_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// First do the multiplication by 3, getting z = [h; d3; ...; d0] +// but immediately form the quotient estimate q = h + 1 + + xorl ashort, ashort + + movq (x), q + movq q, d0 + adcxq q, q + adoxq q, d0 + movq 8(x), q + movq q, d1 + adcxq q, q + adoxq q, d1 + movq 16(x), q + movq q, d2 + adcxq q, q + adoxq q, d2 + movq 24(x), q + movq q, d3 + adcxq q, q + adoxq q, d3 + +// For this limited range a simple quotient estimate of q = h + 1 works, where +// h = floor(z / 2^256). Then -p_256 <= z - q * p_256 < p_256, so we just need +// to subtract q * p_256 and then if that's negative, add back p_256. + + movl $1, qshort + adcxq a, q + adoxq a, q + +// Now compute the initial pre-reduced result z - p_256 * q +// = z - (2^256 - 2^224 + 2^192 + 2^96 - 1) * q +// = z - 2^192 * 0xffffffff00000001 * q - 2^64 * 0x0000000100000000 * q + q + + addq q, d0 + movq $0x0000000100000000, a + mulxq a, a, c + sbbq $0, a + sbbq $0, c + subq a, d1 + sbbq c, d2 + movq $0xffffffff00000001, a + mulxq a, a, c + sbbq a, d3 + sbbq c, q + +// q is now effectively the top word of the 5-digits result; this step +// compensates for q = h + 1 + + decq q + +// Use that as a bitmask for a masked addition of p_256 and write back + + movl $0x00000000ffffffff, ashort + andq q, a + xorl cshort, cshort + subq a, c + addq q, d0 + movq d0, (z) + adcq a, d1 + movq d1, 8(z) + adcq $0, d2 + movq d2, 16(z) + adcq c, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_triple_p256_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_triple_p256_alt.S new file mode 100644 index 00000000000..01221e75f33 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_triple_p256_alt.S @@ -0,0 +1,137 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Triple modulo p_256, z := (3 * x) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_triple_p256_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// The input x can be any 4-digit bignum, not necessarily reduced modulo p_256, +// and the result is always fully reduced, i.e. z = (3 * x) mod p_256. 
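Editor's note: like the other *_alt files in this import, the variant below performs the same computation as bignum_triple_p256 above but with the baseline widening MULQ instruction and explicit carry handling, rather than the MULX/ADCX/ADOX dual-carry chains, which require BMI2 and ADX support. Both styles are built from the same primitive, a 64x64 -> 128-bit multiply, which in portable C is simply the following (the helper name is illustrative):

#include <stdint.h>

// One 64x64 -> 128-bit product split into high and low words, the primitive
// behind both the MULX-based and the classic MULQ-based code paths.
static inline void mul64_wide(uint64_t a, uint64_t b,
                              uint64_t *hi, uint64_t *lo) {
  unsigned __int128 p = (unsigned __int128)a * b;
  *lo = (uint64_t)p;
  *hi = (uint64_t)(p >> 64);
}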
+// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_p256_alt) + .text + +#define z %rdi +#define x %rsi + +// Main digits of intermediate results + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +// Quotient estimate = top of product + 1 + +#define q %rcx + +// Other temporary variables and their short version + +#define a %rax +#define c %rcx +#define d %rdx + +#define ashort %eax +#define cshort %ecx +#define qshort %ecx +#define dshort %edx + +S2N_BN_SYMBOL(bignum_triple_p256_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// First do the multiplication by 3, getting z = [h; d3; ...; d0] +// but immediately form the quotient estimate q = h + 1 + + movl $3, cshort + + movq (x), a + mulq c + movq a, d0 + movq d, d1 + + movq 8(x), a + xorq d2, d2 + mulq c + addq a, d1 + adcq d, d2 + + movq 16(x), a + xorq d3, d3 + mulq c + addq a, d2 + adcq d, d3 + + movq 24(x), a + mulq c + addq a, d3 + +// For this limited range a simple quotient estimate of q = h + 1 works, where +// h = floor(z / 2^256). Then -p_256 <= z - q * p_256 < p_256, so we just need +// to subtract q * p_256 and then if that's negative, add back p_256. + + movl $1, qshort + adcq d, q + +// Now compute the initial pre-reduced result z - p_256 * q +// = z - (2^256 - 2^224 + 2^192 + 2^96 - 1) * q +// = z - 2^192 * 0xffffffff00000001 * q - 2^64 * 0x0000000100000000 * q + q +// Since q is small just use q<<32 for 0x0000000100000000 * q. + + movq $0xffffffff00000001, a + mulq q + movq q, x + shlq $32, x + addq q, d0 + sbbq $0, x + subq x, d1 + sbbq $0, d2 + sbbq a, d3 + sbbq d, q + +// q is now effectively the top word of the 5-digit result; this step +// compensates for q = h + 1 + + decq q + +// Use that as a bitmask for a masked addition of p_256 and write back + + movl $0x00000000ffffffff, ashort + andq q, a + xorl dshort, dshort + subq a, d + addq q, d0 + movq d0, (z) + adcq a, d1 + movq d1, 8(z) + adcq $0, d2 + movq d2, 16(z) + adcq d, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjadd.S new file mode 100644 index 00000000000..2363e448b8c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjadd.S @@ -0,0 +1,589 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjadd +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). 
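Editor's note: the macros in the file below are documented as corresponding exactly to bignum_montmul_p256, bignum_montsqr_p256 and bignum_sub_p256, so the overall field-operation schedule (commented further down as 12 multiplies, 4 squares, 7 subtracts) can be restated as plain C calls to those functions. This is an illustrative sketch, not AWS-LC code: the prototypes are assumed to match the imported s2n-bignum header, every squaring is written as a full Montgomery squaring (the assembly substitutes cheaper almost-Montgomery squarings where its comments note that is safe), and the zero-point handling uses branches where the assembly uses one comparison plus conditional moves.

#include <stdint.h>
#include <string.h>

// Field operations modulo p_256 on 4-limb Montgomery-domain values;
// prototypes assumed to match the s2n-bignum header imported in this change.
extern void bignum_montmul_p256(uint64_t z[static 4], uint64_t x[static 4],
                                uint64_t y[static 4]);
extern void bignum_montsqr_p256(uint64_t z[static 4], uint64_t x[static 4]);
extern void bignum_sub_p256(uint64_t z[static 4], uint64_t x[static 4],
                            uint64_t y[static 4]);

// Illustrative restatement of the p256_montjadd schedule below.
void p256_montjadd_sketch(uint64_t p3[static 12], uint64_t p1[static 12],
                          uint64_t p2[static 12]) {
  uint64_t *x1 = p1, *y1 = p1 + 4, *z1 = p1 + 8;
  uint64_t *x2 = p2, *y2 = p2 + 4, *z2 = p2 + 8;
  uint64_t z1sq[4], z2sq[4], y1a[4], y2a[4], x1a[4], x2a[4];
  uint64_t xd[4], yd[4], zz[4], ww[4], zzx1[4], zzx2[4];
  uint64_t resx[4], resy[4], resz[4], t1[4], t2[4];

  bignum_montsqr_p256(z1sq, z1);            // z1^2
  bignum_montsqr_p256(z2sq, z2);            // z2^2
  bignum_montmul_p256(y1a, z2, y1);
  bignum_montmul_p256(y2a, z1, y2);
  bignum_montmul_p256(x2a, z1sq, x2);       // x2 * z1^2
  bignum_montmul_p256(x1a, z2sq, x1);       // x1 * z2^2
  bignum_montmul_p256(y2a, z1sq, y2a);      // y2 * z1^3
  bignum_montmul_p256(y1a, z2sq, y1a);      // y1 * z2^3
  bignum_sub_p256(xd, x2a, x1a);
  bignum_sub_p256(yd, y2a, y1a);
  bignum_montsqr_p256(zz, xd);
  bignum_montsqr_p256(ww, yd);
  bignum_montmul_p256(zzx1, zz, x1a);
  bignum_montmul_p256(zzx2, zz, x2a);
  bignum_sub_p256(resx, ww, zzx1);
  bignum_sub_p256(t1, zzx2, zzx1);
  bignum_montmul_p256(xd, xd, z1);
  bignum_sub_p256(resx, resx, zzx2);
  bignum_sub_p256(t2, zzx1, resx);
  bignum_montmul_p256(t1, t1, y1a);
  bignum_montmul_p256(resz, xd, z2);
  bignum_montmul_p256(t2, yd, t2);
  bignum_sub_p256(resy, t2, t1);

  // A zero z coordinate marks the point at infinity: if exactly one input is
  // zero, return the other input; otherwise return the computed sum.
  int p1zero = (z1[0] | z1[1] | z1[2] | z1[3]) == 0;
  int p2zero = (z2[0] | z2[1] | z2[2] | z2[3]) == 0;
  if (p1zero && !p2zero) {
    memcpy(p3, p2, 12 * sizeof(uint64_t));
  } else if (p2zero && !p1zero) {
    memcpy(p3, p1, 12 * sizeof(uint64_t));
  } else {
    memcpy(p3, resx, sizeof(resx));
    memcpy(p3 + 4, resy, sizeof(resy));
    memcpy(p3 + 8, resz, sizeof(resz));
  }
}

The safety of feeding almost-Montgomery squarings into later multiplications is argued in the comment on amontsqr_p256 in the assembly below.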
+// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjadd) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1 and %rbp = p2, +// which needs to be set up explicitly before use. +// The first two hold initially, and the second is +// set up by copying the initial %rdx input to %rbp. +// Thereafter, no code macro modifies any of them. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rbp) +#define y_2 NUMSIZE(%rbp) +#define z_2 (2*NUMSIZE)(%rbp) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define x1a (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define z2sq (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define y1a (NUMSIZE*6)(%rsp) + +#define NSPACE (NUMSIZE*7) + +// Corresponds exactly to bignum_montmul_p256 + +#define montmul_p256(P0,P1,P2) \ + xorl %r13d, %r13d ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rbx, %r10 ; \ + adcq %rbx, %r9 ; \ + mulxq 0x10+P1, %rbx, %r11 ; \ + adcq %rbx, %r10 ; \ + mulxq 0x18+P1, %rbx, %r12 ; \ + adcq %rbx, %r11 ; \ + adcq %r13, %r12 ; \ + movq 0x8+P2, %rdx ; \ + xorl %r14d, %r14d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcq %r14, %r13 ; \ + xorl %r15d, %r15d ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + notq %rdx; \ + leaq 0x2(%rdx), %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %r15, %r13 ; \ + adoxq %r15, %r14 ; \ + adcq %r15, %r14 ; \ + movq 0x10+P2, %rdx ; \ + xorl %r8d, %r8d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adoxq %r8, %r14 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcq %rax, %r13 ; \ + adcq %rbx, %r14 ; \ + adcq %r8, %r15 ; \ + movq 0x18+P2, %rdx ; \ + xorl %r9d, %r9d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + adoxq %r9, %r15 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcq %rax, 
%r14 ; \ + adcq %rbx, %r15 ; \ + adcq %r9, %r8 ; \ + xorl %r9d, %r9d ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + notq %rdx; \ + leaq 0x2(%rdx), %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + adcxq %r9, %r15 ; \ + adoxq %r9, %r8 ; \ + adcq %r9, %r8 ; \ + movl $0x1, %ecx ; \ + addq %r12, %rcx ; \ + decq %rdx; \ + adcq %r13, %rdx ; \ + decq %r9; \ + movq %r9, %rax ; \ + adcq %r14, %r9 ; \ + movl $0xfffffffe, %r11d ; \ + adcq %r15, %r11 ; \ + adcq %r8, %rax ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %r9, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_montsqr_p256 except for +// register tweaks to avoid modifying %rbp. + +#define montsqr_p256(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rcx, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r15 ; \ + xorl %ecx, %ecx ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + movabsq $0xffffffff00000001, %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rcx, %r13 ; \ + movl %ecx, %r9d ; \ + adoxq %rcx, %r9 ; \ + adcxq %rcx, %r9 ; \ + addq %r9, %r14 ; \ + adcq %rcx, %r15 ; \ + movl %ecx, %r8d ; \ + adcq %rcx, %r8 ; \ + xorl %ecx, %ecx ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + movabsq $0xffffffff00000001, %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r8 ; \ + adcq %rcx, %r8 ; \ + movl $0x1, %ebx ; \ + addq %r12, %rbx ; \ + leaq -0x1(%rdx), %rdx ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rcx), %rcx ; \ + movq %rcx, %rax ; \ + adcq %r14, %rcx ; \ + movl $0xfffffffe, %r11d ; \ + adcq %r15, %r11 ; \ + adcq %r8, %rax ; \ + cmovbq %rbx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rcx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Almost-Montgomery variant which we use 
when an input to other muls +// with the other argument fully reduced (which is always safe). +// Again, the basic squaring code is tweaked to avoid modifying %rbp. + +#define amontsqr_p256(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rcx, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r15 ; \ + xorl %ecx, %ecx ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + movabsq $0xffffffff00000001, %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rcx, %r13 ; \ + movl %ecx, %r9d ; \ + adoxq %rcx, %r9 ; \ + adcxq %rcx, %r9 ; \ + addq %r9, %r14 ; \ + adcq %rcx, %r15 ; \ + movl %ecx, %r8d ; \ + adcq %rcx, %r8 ; \ + xorl %ecx, %ecx ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + movabsq $0xffffffff00000001, %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r8 ; \ + adcq %rcx, %r8 ; \ + movl $0x1, %r8d ; \ + leaq -0x1(%rdx), %rdx ; \ + leaq -0x1(%rcx), %rax ; \ + movl $0xfffffffe, %r11d ; \ + cmovzq %rcx, %r8 ; \ + cmovzq %rcx, %rdx ; \ + cmovzq %rcx, %rax ; \ + cmovzq %rcx, %r11 ; \ + addq %r8, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %rax, %r14 ; \ + adcq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_sub_p256 + +#define sub_p256(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + sbbq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movl $0xffffffff, %r10d ; \ + sbbq %r11, %r11 ; \ + xorq %rdx, %rdx ; \ + andq %r11, %r10 ; \ + subq %r10, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// Additional macros to help with final multiplexing + +#define load4(r0,r1,r2,r3,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 + +#define store4(P,r0,r1,r2,r3) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P + +#define czload4(r0,r1,r2,r3,P) \ + cmovzq P, r0 ; \ + cmovzq 8+P, r1 ; \ + cmovzq 16+P, r2 ; \ + cmovzq 24+P, r3 
+ +#define muxload4(r0,r1,r2,r3,P0,P1,P2) \ + movq P0, r0 ; \ + cmovbq P1, r0 ; \ + cmovnbe P2, r0 ; \ + movq 8+P0, r1 ; \ + cmovbq 8+P1, r1 ; \ + cmovnbe 8+P2, r1 ; \ + movq 16+P0, r2 ; \ + cmovbq 16+P1, r2 ; \ + cmovnbe 16+P2, r2 ; \ + movq 24+P0, r3 ; \ + cmovbq 24+P1, r3 ; \ + cmovnbe 24+P2, r3 + +S2N_BN_SYMBOL(p256_montjadd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input y in %rbp where it lasts as long as it's needed. + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdx, %rbp + +// Main code, just a sequence of basic field operations +// 12 * multiply + 4 * square + 7 * subtract + + amontsqr_p256(z1sq,z_1) + amontsqr_p256(z2sq,z_2) + + montmul_p256(y1a,z_2,y_1) + montmul_p256(y2a,z_1,y_2) + + montmul_p256(x2a,z1sq,x_2) + montmul_p256(x1a,z2sq,x_1) + montmul_p256(y2a,z1sq,y2a) + montmul_p256(y1a,z2sq,y1a) + + sub_p256(xd,x2a,x1a) + sub_p256(yd,y2a,y1a) + + amontsqr_p256(zz,xd) + montsqr_p256(ww,yd) + + montmul_p256(zzx1,zz,x1a) + montmul_p256(zzx2,zz,x2a) + + sub_p256(resx,ww,zzx1) + sub_p256(t1,zzx2,zzx1) + + montmul_p256(xd,xd,z_1) + + sub_p256(resx,resx,zzx2) + + sub_p256(t2,zzx1,resx) + + montmul_p256(t1,t1,y1a) + + montmul_p256(resz,xd,z_2) + montmul_p256(t2,yd,t2) + + sub_p256(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "NBE" <=> ~(CF \/ ZF) <=> P1 = 0 /\ ~(P2 = 0) +// and "B" <=> CF <=> ~(P1 = 0) /\ P2 = 0 +// and "Z" <=> ZF <=> (P1 = 0 <=> P2 = 0) + + load4(%r8,%r9,%r10,%r11,z_1) + + movq %r8, %rax + movq %r9, %rdx + orq %r10, %rax + orq %r11, %rdx + orq %rdx, %rax + negq %rax + sbbq %rax, %rax + + load4(%r12,%r13,%r14,%r15,z_2) + + movq %r12, %rbx + movq %r13, %rdx + orq %r14, %rbx + orq %r15, %rdx + orq %rdx, %rbx + negq %rbx + sbbq %rbx, %rbx + + cmpq %rax, %rbx + +// Multiplex the outputs accordingly, re-using the z's in registers + + cmovbq %r8, %r12 + cmovbq %r9, %r13 + cmovbq %r10, %r14 + cmovbq %r11, %r15 + + czload4(%r12,%r13,%r14,%r15,resz) + + muxload4(%rax,%rbx,%rcx,%rdx,resx,x_1,x_2) + muxload4(%r8,%r9,%r10,%r11,resy,y_1,y_2) + +// Finally store back the multiplexed values + + store4(x_3,%rax,%rbx,%rcx,%rdx) + store4(y_3,%r8,%r9,%r10,%r11) + store4(z_3,%r12,%r13,%r14,%r15) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjadd_alt.S new file mode 100644 index 00000000000..7dfa8b10aa3 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjadd_alt.S @@ -0,0 +1,574 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjadd_alt +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjadd_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1 and %rbp = p2, +// which needs to be set up explicitly before use. +// The first two hold initially, and the second is +// set up by copying the initial %rdx input to %rbp. +// Thereafter, no code macro modifies any of them. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rbp) +#define y_2 NUMSIZE(%rbp) +#define z_2 (2*NUMSIZE)(%rbp) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define x1a (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define z2sq (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define y1a (NUMSIZE*6)(%rsp) + +#define NSPACE (NUMSIZE*7) + +// Corresponds exactly to bignum_montmul_p256_alt + +#define montmul_p256(P0,P1,P2) \ + movq P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + xorl %r10d, %r10d ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + xorl %r11d, %r11d ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + xorl %r12d, %r12d ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + movq 0x8+P2, %rbx ; \ + xorl %r13d, %r13d ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r14, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r14, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r14, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r14, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + xorl %r14d, %r14d ; \ + movq $0x100000000, %rbx ; \ + movq %r8, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r15, %r15 ; \ + movq %r9, %rax ; \ + mulq %rbx; \ + subq %r15, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r15, %r15 ; \ + notq %rbx; \ + leaq 0x2(%rbx), %rbx ; \ + 
movq %r8, %rax ; \ + mulq %rbx; \ + subq %r15, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r15, %r15 ; \ + movq %r9, %rax ; \ + mulq %rbx; \ + subq %r15, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P2, %rbx ; \ + xorl %r15d, %r15d ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r8, %r8 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r8, %r8 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r8, %r8 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P2, %rbx ; \ + xorl %r8d, %r8d ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r9, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r9, %r9 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r9, %r9 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r8, %r8 ; \ + xorl %r9d, %r9d ; \ + movq $0x100000000, %rbx ; \ + movq %r10, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + movq %r11, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rcx, %rcx ; \ + notq %rbx; \ + leaq 0x2(%rbx), %rbx ; \ + movq %r10, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rcx, %rcx ; \ + movq %r11, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r9, %r8 ; \ + movl $0x1, %ecx ; \ + addq %r12, %rcx ; \ + decq %rbx; \ + adcq %r13, %rbx ; \ + decq %r9; \ + movq %r9, %rax ; \ + adcq %r14, %r9 ; \ + movl $0xfffffffe, %r11d ; \ + adcq %r15, %r11 ; \ + adcq %r8, %rax ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rbx, %r13 ; \ + cmovbq %r9, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_montsqr_p256_alt + +#define montsqr_p256(P0,P1) \ + movq P1, %rax ; \ + movq %rax, %rbx ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r15 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r9 ; \ + movq %rdx, %r10 ; \ + movq 0x18+P1, %rax ; \ + movq %rax, %r13 ; \ + mulq %rbx; \ + movq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + movq 0x10+P1, %rax ; \ + movq %rax, %rbx ; \ + mulq %r13; \ + movq %rax, %r13 ; \ + movq %rdx, %r14 ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + movq 0x18+P1, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorl %ecx, %ecx ; \ + addq %r9, %r9 ; \ + adcq %r10, %r10 ; \ + adcq %r11, %r11 ; \ + adcq %r12, %r12 ; \ + adcq %r13, %r13 ; \ + adcq %r14, %r14 ; \ + adcq %rcx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %r15, %r9 ; \ + adcq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + negq %r15; \ + adcq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r15, %r15 ; \ + movq 0x18+P1, 
%rax ; \ + mulq %rax; \ + negq %r15; \ + adcq %rax, %r14 ; \ + adcq %rcx, %rdx ; \ + movq %rdx, %r15 ; \ + movq $0x100000000, %rbx ; \ + movq %r8, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r9, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + notq %rbx; \ + leaq 0x2(%rbx), %rbx ; \ + movq %r8, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + xorl %r8d, %r8d ; \ + movq %r9, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r8, %r14 ; \ + adcq %r8, %r15 ; \ + adcq %r8, %r8 ; \ + movq $0x100000000, %rbx ; \ + movq %r10, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + movq %r11, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rcx, %rcx ; \ + notq %rbx; \ + leaq 0x2(%rbx), %rbx ; \ + movq %r10, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rcx, %rcx ; \ + xorl %r9d, %r9d ; \ + movq %r11, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r9, %r8 ; \ + movl $0x1, %ecx ; \ + addq %r12, %rcx ; \ + leaq -0x1(%rbx), %rbx ; \ + adcq %r13, %rbx ; \ + leaq -0x1(%r9), %r9 ; \ + movq %r9, %rax ; \ + adcq %r14, %r9 ; \ + movl $0xfffffffe, %r11d ; \ + adcq %r15, %r11 ; \ + adcq %r8, %rax ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rbx, %r13 ; \ + cmovbq %r9, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_sub_p256 + +#define sub_p256(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + sbbq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movl $0xffffffff, %r10d ; \ + sbbq %r11, %r11 ; \ + xorq %rdx, %rdx ; \ + andq %r11, %r10 ; \ + subq %r10, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// Additional macros to help with final multiplexing + +#define load4(r0,r1,r2,r3,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 + +#define store4(P,r0,r1,r2,r3) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P + +#define czload4(r0,r1,r2,r3,P) \ + cmovzq P, r0 ; \ + cmovzq 8+P, r1 ; \ + cmovzq 16+P, r2 ; \ + cmovzq 24+P, r3 + +#define muxload4(r0,r1,r2,r3,P0,P1,P2) \ + movq P0, r0 ; \ + cmovbq P1, r0 ; \ + cmovnbe P2, r0 ; \ + movq 8+P0, r1 ; \ + cmovbq 8+P1, r1 ; \ + cmovnbe 8+P2, r1 ; \ + movq 16+P0, r2 ; \ + cmovbq 16+P1, r2 ; \ + cmovnbe 16+P2, r2 ; \ + movq 24+P0, r3 ; \ + cmovbq 24+P1, r3 ; \ + cmovnbe 24+P2, r3 + +S2N_BN_SYMBOL(p256_montjadd_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input y in %rbp where it lasts as long as it's needed. 
+ + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdx, %rbp + +// Main code, just a sequence of basic field operations +// 12 * multiply + 4 * square + 7 * subtract + + montsqr_p256(z1sq,z_1) + montsqr_p256(z2sq,z_2) + + montmul_p256(y1a,z_2,y_1) + montmul_p256(y2a,z_1,y_2) + + montmul_p256(x2a,z1sq,x_2) + montmul_p256(x1a,z2sq,x_1) + montmul_p256(y2a,z1sq,y2a) + montmul_p256(y1a,z2sq,y1a) + + sub_p256(xd,x2a,x1a) + sub_p256(yd,y2a,y1a) + + montsqr_p256(zz,xd) + montsqr_p256(ww,yd) + + montmul_p256(zzx1,zz,x1a) + montmul_p256(zzx2,zz,x2a) + + sub_p256(resx,ww,zzx1) + sub_p256(t1,zzx2,zzx1) + + montmul_p256(xd,xd,z_1) + + sub_p256(resx,resx,zzx2) + + sub_p256(t2,zzx1,resx) + + montmul_p256(t1,t1,y1a) + + montmul_p256(resz,xd,z_2) + montmul_p256(t2,yd,t2) + + sub_p256(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "NBE" <=> ~(CF \/ ZF) <=> P1 = 0 /\ ~(P2 = 0) +// and "B" <=> CF <=> ~(P1 = 0) /\ P2 = 0 +// and "Z" <=> ZF <=> (P1 = 0 <=> P2 = 0) + + load4(%r8,%r9,%r10,%r11,z_1) + + movq %r8, %rax + movq %r9, %rdx + orq %r10, %rax + orq %r11, %rdx + orq %rdx, %rax + negq %rax + sbbq %rax, %rax + + load4(%r12,%r13,%r14,%r15,z_2) + + movq %r12, %rbx + movq %r13, %rdx + orq %r14, %rbx + orq %r15, %rdx + orq %rdx, %rbx + negq %rbx + sbbq %rbx, %rbx + + cmpq %rax, %rbx + +// Multiplex the outputs accordingly, re-using the z's in registers + + cmovbq %r8, %r12 + cmovbq %r9, %r13 + cmovbq %r10, %r14 + cmovbq %r11, %r15 + + czload4(%r12,%r13,%r14,%r15,resz) + + muxload4(%rax,%rbx,%rcx,%rdx,resx,x_1,x_2) + muxload4(%r8,%r9,%r10,%r11,resy,y_1,y_2) + +// Finally store back the multiplexed values + + store4(x_3,%rax,%rbx,%rcx,%rdx) + store4(y_3,%r8,%r9,%r10,%r11) + store4(z_3,%r12,%r13,%r14,%r15) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjdouble.S new file mode 100644 index 00000000000..ef0904c25dc --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjdouble.S @@ -0,0 +1,630 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjdouble +// (uint64_t p3[static 12],uint64_t p1[static 12]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). 
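Editor's note: as with the addition routines earlier in this import, the doubling below is a fixed schedule of the per-field-element primitives its macros are documented as corresponding to (bignum_montsqr_p256, bignum_montmul_p256, bignum_add_p256, bignum_sub_p256), plus a few fused small-scalar combinations. The C sketch below restates that schedule under assumptions: the prototypes are taken to match the imported s2n-bignum header, a simple double-and-add helper stands in for the fused cmsub macros, and fully reduced additions replace the "weak" addition the assembly uses for x + z^2.

#include <stdint.h>
#include <string.h>

// Modular arithmetic on 4-limb values; prototypes assumed to match the
// s2n-bignum header imported in this change.
extern void bignum_montmul_p256(uint64_t z[static 4], uint64_t x[static 4],
                                uint64_t y[static 4]);
extern void bignum_montsqr_p256(uint64_t z[static 4], uint64_t x[static 4]);
extern void bignum_add_p256(uint64_t z[static 4], uint64_t x[static 4],
                            uint64_t y[static 4]);
extern void bignum_sub_p256(uint64_t z[static 4], uint64_t x[static 4],
                            uint64_t y[static 4]);

// (c * x) mod p_256 for a small constant c, by double-and-add on reduced
// values.  Purely illustrative: the assembly fuses these scalings into its
// cmsub_p256 / cmsub41_p256 / cmsub38_p256 macros instead.
static void scale_small_p256(uint64_t z[4], uint64_t c, uint64_t x[4]) {
  uint64_t acc[4] = {0, 0, 0, 0};
  for (int bit = 63; bit >= 0; bit--) {
    bignum_add_p256(acc, acc, acc);        // acc := 2 * acc mod p_256
    if ((c >> bit) & 1)
      bignum_add_p256(acc, acc, x);        // acc := acc + x mod p_256
  }
  memcpy(z, acc, sizeof(acc));
}

// Illustrative restatement of the p256_montjdouble schedule below.
void p256_montjdouble_sketch(uint64_t p3[static 12], uint64_t p1[static 12]) {
  uint64_t *x = p1, *y = p1 + 4, *z = p1 + 8;
  uint64_t *x3 = p3, *y3 = p3 + 4, *z3 = p3 + 8;
  uint64_t z2[4], y2[4], x2p[4], xy2[4], x4p[4], y4[4];
  uint64_t t1[4], t2[4], d[4], dx2[4], u[4], v[4];

  bignum_montsqr_p256(z2, z);              // z^2
  bignum_montsqr_p256(y2, y);              // y^2
  bignum_sub_p256(t2, x, z2);              // x - z^2
  bignum_add_p256(t1, x, z2);              // x + z^2
  bignum_montmul_p256(x2p, t1, t2);        // x^2 - z^4
  bignum_add_p256(t1, y, z);               // y + z
  bignum_montmul_p256(xy2, x, y2);         // x * y^2
  bignum_montsqr_p256(x4p, x2p);           // (x^2 - z^4)^2
  bignum_montsqr_p256(t1, t1);             // (y + z)^2
  scale_small_p256(u, 12, xy2);
  scale_small_p256(v, 9, x4p);
  bignum_sub_p256(d, u, v);                // d = 12 * xy2 - 9 * x4p
  bignum_sub_p256(t1, t1, z2);             // y^2 + 2*y*z
  bignum_montsqr_p256(y4, y2);             // y^4
  bignum_montmul_p256(dx2, d, x2p);        // d * (x^2 - z^4)
  bignum_sub_p256(z3, t1, y2);             // z' = 2*y*z
  scale_small_p256(u, 4, xy2);
  bignum_sub_p256(x3, u, d);               // x' = 4 * xy2 - d
  scale_small_p256(u, 3, dx2);
  scale_small_p256(v, 8, y4);
  bignum_sub_p256(y3, u, v);               // y' = 3 * dx2 - 8 * y4
}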
+// +// Standard x86-64 ABI: RDI = p3, RSI = p1 +// Microsoft x64 ABI: RCX = p3, RDX = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjdouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjdouble) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1, which is true when the +// arguments come in initially and is not disturbed throughout. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z2 (NUMSIZE*0)(%rsp) +#define y4 (NUMSIZE*0)(%rsp) + +#define y2 (NUMSIZE*1)(%rsp) + +#define t1 (NUMSIZE*2)(%rsp) + +#define t2 (NUMSIZE*3)(%rsp) +#define x2p (NUMSIZE*3)(%rsp) +#define dx2 (NUMSIZE*3)(%rsp) + +#define xy2 (NUMSIZE*4)(%rsp) + +#define x4p (NUMSIZE*5)(%rsp) +#define d (NUMSIZE*5)(%rsp) + +#define NSPACE (NUMSIZE*6) + +// Corresponds exactly to bignum_montmul_p256 + +#define montmul_p256(P0,P1,P2) \ + xorl %r13d, %r13d ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rbx, %r10 ; \ + adcq %rbx, %r9 ; \ + mulxq 0x10+P1, %rbx, %r11 ; \ + adcq %rbx, %r10 ; \ + mulxq 0x18+P1, %rbx, %r12 ; \ + adcq %rbx, %r11 ; \ + adcq %r13, %r12 ; \ + movq 0x8+P2, %rdx ; \ + xorl %r14d, %r14d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcq %r14, %r13 ; \ + xorl %r15d, %r15d ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + notq %rdx; \ + leaq 0x2(%rdx), %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %r15, %r13 ; \ + adoxq %r15, %r14 ; \ + adcq %r15, %r14 ; \ + movq 0x10+P2, %rdx ; \ + xorl %r8d, %r8d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adoxq %r8, %r14 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcq %rax, %r13 ; \ + adcq %rbx, %r14 ; \ + adcq %r8, %r15 ; \ + movq 0x18+P2, %rdx ; \ + xorl %r9d, %r9d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + adoxq %r9, %r15 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcq %rax, %r14 ; \ + adcq %rbx, %r15 ; \ + adcq %r9, %r8 ; \ + xorl %r9d, %r9d ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + notq %rdx; \ + leaq 0x2(%rdx), %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; 
\ + adcxq %r9, %r15 ; \ + adoxq %r9, %r8 ; \ + adcq %r9, %r8 ; \ + movl $0x1, %ecx ; \ + addq %r12, %rcx ; \ + decq %rdx; \ + adcq %r13, %rdx ; \ + decq %r9; \ + movq %r9, %rax ; \ + adcq %r14, %r9 ; \ + movl $0xfffffffe, %r11d ; \ + adcq %r15, %r11 ; \ + adcq %r8, %rax ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %r9, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_montsqr_p256 + +#define montsqr_p256(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ebp, %ebp ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rbp, %r13 ; \ + adoxq %rbp, %r14 ; \ + adcq %rbp, %r14 ; \ + xorl %ebp, %ebp ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rbp, %r15 ; \ + adoxq %rbp, %r15 ; \ + xorl %ebp, %ebp ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + movabsq $0xffffffff00000001, %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rbp, %r13 ; \ + movl %ebp, %r9d ; \ + adoxq %rbp, %r9 ; \ + adcxq %rbp, %r9 ; \ + addq %r9, %r14 ; \ + adcq %rbp, %r15 ; \ + movl %ebp, %r8d ; \ + adcq %rbp, %r8 ; \ + xorl %ebp, %ebp ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + movabsq $0xffffffff00000001, %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + adcxq %rbp, %r15 ; \ + adoxq %rbp, %r8 ; \ + adcq %rbp, %r8 ; \ + movl $0x1, %ecx ; \ + addq %r12, %rcx ; \ + leaq -0x1(%rdx), %rdx ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rbp), %rbp ; \ + movq %rbp, %rax ; \ + adcq %r14, %rbp ; \ + movl $0xfffffffe, %r11d ; \ + adcq %r15, %r11 ; \ + adcq %r8, %rax ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rbp, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_sub_p256 + +#define sub_p256(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + sbbq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movl $0xffffffff, %r10d ; \ + sbbq %r11, %r11 ; \ + xorq %rdx, %rdx ; \ + andq %r11, %r10 ; \ + subq %r10, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 
0x18+P0 + +// Corresponds exactly to bignum_add_p256 + +#define add_p256(P0,P1,P2) \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + adcq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + adcq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + adcq 0x18+P2, %r9 ; \ + adcq %r11, %r11 ; \ + subq $0xffffffffffffffff, %rax ; \ + movl $0xffffffff, %r10d ; \ + sbbq %r10, %rcx ; \ + sbbq $0x0, %r8 ; \ + movq $0xffffffff00000001, %rdx ; \ + sbbq %rdx, %r9 ; \ + sbbq $0x0, %r11 ; \ + andq %r11, %r10 ; \ + andq %r11, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// A weak version of add that only guarantees sum in 4 digits + +#define weakadd_p256(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + adcq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + adcq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + adcq 0x18+P2, %r9 ; \ + movl $0xffffffff, %r10d ; \ + sbbq %r11, %r11 ; \ + xorq %rdx, %rdx ; \ + andq %r11, %r10 ; \ + subq %r10, %rdx ; \ + subq %r11, %rax ; \ + movq %rax, P0 ; \ + sbbq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + sbbq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// P0 = C * P1 - D * P2 computed as d * (p_256 - P2) + c * P1 +// Quotient estimation is done just as q = h + 1 as in bignum_triple_p256 +// This also applies to the other functions following. + +#define cmsub_p256(P0,C,P1,D,P2) \ + /* First (%r11;%r10;%r9;%r8) = p_256 - P2 */ \ + movq $0xffffffffffffffff, %r8 ; \ + xorl %r10d, %r10d ; \ + subq P2, %r8 ; \ + movq $0x00000000ffffffff, %r9 ; \ + sbbq 0x8+P2, %r9 ; \ + sbbq 0x10+P2, %r10 ; \ + movq $0xffffffff00000001, %r11 ; \ + sbbq 0x18+P2, %r11 ; \ + /* (%r12;%r11;%r10;%r9;%r8) = D * (p_256 - P2) */ \ + xorl %r12d, %r12d ; \ + movq $D, %rdx ; \ + mulxq %r8, %r8, %rax ; \ + mulxq %r9, %r9, %rcx ; \ + addq %rax, %r9 ; \ + mulxq %r10, %r10, %rax ; \ + adcq %rcx, %r10 ; \ + mulxq %r11, %r11, %rcx ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + /* (%rdx;%r11;%r10;%r9;%r8) = 2^256 + C * P1 + D * (p_256 - P2) */ \ + movq $C, %rdx ; \ + xorl %eax, %eax ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq 0x10+P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x18+P1, %rax, %rdx ; \ + adcxq %rax, %r11 ; \ + adoxq %r12, %rdx ; \ + adcq $1, %rdx ; \ + /* Now the tail for modular reduction from tripling */ \ + addq %rdx, %r8 ; \ + movq $0x100000000, %rax ; \ + mulxq %rax, %rax, %rcx ; \ + sbbq $0x0, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rax, %rcx ; \ + sbbq %rax, %r11 ; \ + sbbq %rcx, %rdx ; \ + decq %rdx; \ + movl $0xffffffff, %eax ; \ + andq %rdx, %rax ; \ + xorl %ecx, %ecx ; \ + subq %rax, %rcx ; \ + addq %rdx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + adcq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + adcq %rcx, %r11 ; \ + movq %r11, 0x18+P0 + +// P0 = 3 * P1 - 8 * P2, computed as (p_256 - P2) << 3 + 3 * P1 + +#define cmsub38_p256(P0,P1,P2) \ + /* First (%r11;%r10;%r9;%r8) = p_256 - P2 */ \ + movq $0xffffffffffffffff, %r8 ; \ + xorl %r10d, %r10d ; \ + subq P2, %r8 ; \ + movq $0x00000000ffffffff, %r9 ; \ + sbbq 0x8+P2, %r9 ; \ + sbbq 0x10+P2, %r10 ; \ + movq $0xffffffff00000001, %r11 ; \ + sbbq 0x18+P2, %r11 ; \ + /* 
(%r12;%r11;%r10;%r9;%r8) = (p_256 - P2) << 3 */ \ + movq %r11, %r12 ; \ + shldq $3, %r10, %r11 ; \ + shldq $3, %r9, %r10 ; \ + shldq $3, %r8, %r9 ; \ + shlq $3, %r8 ; \ + shrq $61, %r12 ; \ + /* (%rdx;%r11;%r10;%r9;%r8) = 2^256 + 3 * P1 + 8 * (p_256 - P2) */ \ + movq $3, %rdx ; \ + xorl %eax, %eax ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq 0x10+P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x18+P1, %rax, %rdx ; \ + adcxq %rax, %r11 ; \ + adoxq %r12, %rdx ; \ + adcq $1, %rdx ; \ + /* Now the tail for modular reduction from tripling */ \ + addq %rdx, %r8 ; \ + movq $0x100000000, %rax ; \ + mulxq %rax, %rax, %rcx ; \ + sbbq $0x0, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rax, %rcx ; \ + sbbq %rax, %r11 ; \ + sbbq %rcx, %rdx ; \ + decq %rdx; \ + movl $0xffffffff, %eax ; \ + andq %rdx, %rax ; \ + xorl %ecx, %ecx ; \ + subq %rax, %rcx ; \ + addq %rdx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + adcq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + adcq %rcx, %r11 ; \ + movq %r11, 0x18+P0 + +// P0 = 4 * P1 - P2, by direct subtraction of P2, +// since the quotient estimate still works safely +// for initial value > -p_256 + +#define cmsub41_p256(P0,P1,P2) \ + movq 0x18+P1, %r11 ; \ + movq %r11, %rdx ; \ + movq 0x10+P1, %r10 ; \ + shldq $2, %r10, %r11 ; \ + movq 0x8+P1, %r9 ; \ + shldq $2, %r9, %r10 ; \ + movq P1, %r8 ; \ + shldq $2, %r8, %r9 ; \ + shlq $2, %r8 ; \ + shrq $62, %rdx ; \ + addq $1, %rdx ; \ + subq P2, %r8 ; \ + sbbq 0x8+P2, %r9 ; \ + sbbq 0x10+P2, %r10 ; \ + sbbq 0x18+P2, %r11 ; \ + sbbq $0, %rdx ; \ + /* Now the tail for modular reduction from tripling */ \ + addq %rdx, %r8 ; \ + movq $0x100000000, %rax ; \ + mulxq %rax, %rax, %rcx ; \ + sbbq $0x0, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rax, %rcx ; \ + sbbq %rax, %r11 ; \ + sbbq %rcx, %rdx ; \ + decq %rdx; \ + movl $0xffffffff, %eax ; \ + andq %rdx, %rax ; \ + xorl %ecx, %ecx ; \ + subq %rax, %rcx ; \ + addq %rdx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + adcq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + adcq %rcx, %r11 ; \ + movq %r11, 0x18+P0 + +S2N_BN_SYMBOL(p256_montjdouble): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room on stack for temporary variables + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + montsqr_p256(z2,z_1) + montsqr_p256(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + sub_p256(t2,x_1,z2) + weakadd_p256(t1,x_1,z2) + montmul_p256(x2p,t1,t2) + +// t1 = y + z +// xy2 = x * y^2 +// x4p = x2p^2 + + add_p256(t1,y_1,z_1) + montmul_p256(xy2,x_1,y2) + montsqr_p256(x4p,x2p) + +// t1 = (y + z)^2 + + montsqr_p256(t1,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_p256(d,12,xy2,9,x4p) + sub_p256(t1,t1,z2) + +// y4 = y^4 + + montsqr_p256(y4,y2) + +// dx2 = d * x2p + + montmul_p256(dx2,d,x2p) + +// z_3' = 2 * y * z + + sub_p256(z_3,t1,y2) + +// x' = 4 * xy2 - d + + cmsub41_p256(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_p256(y_3,dx2,y4) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq 
%r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjdouble_alt.S new file mode 100644 index 00000000000..6a3a5e630d9 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjdouble_alt.S @@ -0,0 +1,743 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjdouble_alt +// (uint64_t p3[static 12],uint64_t p1[static 12]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard x86-64 ABI: RDI = p3, RSI = p1 +// Microsoft x64 ABI: RCX = p3, RDX = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjdouble_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1, which is true when the +// arguments come in initially and is not disturbed throughout. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z2 (NUMSIZE*0)(%rsp) +#define y4 (NUMSIZE*0)(%rsp) + +#define y2 (NUMSIZE*1)(%rsp) + +#define t1 (NUMSIZE*2)(%rsp) + +#define t2 (NUMSIZE*3)(%rsp) +#define x2p (NUMSIZE*3)(%rsp) +#define dx2 (NUMSIZE*3)(%rsp) + +#define xy2 (NUMSIZE*4)(%rsp) + +#define x4p (NUMSIZE*5)(%rsp) +#define d (NUMSIZE*5)(%rsp) + +#define NSPACE (NUMSIZE*6) + +// Corresponds exactly to bignum_montmul_p256_alt + +#define montmul_p256(P0,P1,P2) \ + movq P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + xorl %r10d, %r10d ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + xorl %r11d, %r11d ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + xorl %r12d, %r12d ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + movq 0x8+P2, %rbx ; \ + xorl %r13d, %r13d ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r14, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r14, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r14, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r14, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + xorl %r14d, %r14d ; \ + movq $0x100000000, %rbx ; \ + movq %r8, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r15, %r15 ; \ + movq 
%r9, %rax ; \ + mulq %rbx; \ + subq %r15, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r15, %r15 ; \ + notq %rbx; \ + leaq 0x2(%rbx), %rbx ; \ + movq %r8, %rax ; \ + mulq %rbx; \ + subq %r15, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r15, %r15 ; \ + movq %r9, %rax ; \ + mulq %rbx; \ + subq %r15, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P2, %rbx ; \ + xorl %r15d, %r15d ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r8, %r8 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r8, %r8 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r8, %r8 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P2, %rbx ; \ + xorl %r8d, %r8d ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r9, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r9, %r9 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r9, %r9 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r8, %r8 ; \ + xorl %r9d, %r9d ; \ + movq $0x100000000, %rbx ; \ + movq %r10, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + movq %r11, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rcx, %rcx ; \ + notq %rbx; \ + leaq 0x2(%rbx), %rbx ; \ + movq %r10, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rcx, %rcx ; \ + movq %r11, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r9, %r8 ; \ + movl $0x1, %ecx ; \ + addq %r12, %rcx ; \ + decq %rbx; \ + adcq %r13, %rbx ; \ + decq %r9; \ + movq %r9, %rax ; \ + adcq %r14, %r9 ; \ + movl $0xfffffffe, %r11d ; \ + adcq %r15, %r11 ; \ + adcq %r8, %rax ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rbx, %r13 ; \ + cmovbq %r9, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_montsqr_p256_alt + +#define montsqr_p256(P0,P1) \ + movq P1, %rax ; \ + movq %rax, %rbx ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r15 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r9 ; \ + movq %rdx, %r10 ; \ + movq 0x18+P1, %rax ; \ + movq %rax, %r13 ; \ + mulq %rbx; \ + movq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + movq 0x10+P1, %rax ; \ + movq %rax, %rbx ; \ + mulq %r13; \ + movq %rax, %r13 ; \ + movq %rdx, %r14 ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + movq 0x18+P1, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorl %ecx, %ecx ; \ + addq %r9, %r9 ; \ + adcq %r10, %r10 ; \ + adcq %r11, %r11 ; \ + adcq %r12, %r12 ; \ + adcq %r13, %r13 ; \ + adcq %r14, %r14 ; \ + adcq %rcx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %r15, %r9 ; \ + adcq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ 
+ sbbq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + negq %r15; \ + adcq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + negq %r15; \ + adcq %rax, %r14 ; \ + adcq %rcx, %rdx ; \ + movq %rdx, %r15 ; \ + movq $0x100000000, %rbx ; \ + movq %r8, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r9, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + notq %rbx; \ + leaq 0x2(%rbx), %rbx ; \ + movq %r8, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + xorl %r8d, %r8d ; \ + movq %r9, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r8, %r14 ; \ + adcq %r8, %r15 ; \ + adcq %r8, %r8 ; \ + movq $0x100000000, %rbx ; \ + movq %r10, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + movq %r11, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rcx, %rcx ; \ + notq %rbx; \ + leaq 0x2(%rbx), %rbx ; \ + movq %r10, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rcx, %rcx ; \ + xorl %r9d, %r9d ; \ + movq %r11, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r9, %r8 ; \ + movl $0x1, %ecx ; \ + addq %r12, %rcx ; \ + leaq -0x1(%rbx), %rbx ; \ + adcq %r13, %rbx ; \ + leaq -0x1(%r9), %r9 ; \ + movq %r9, %rax ; \ + adcq %r14, %r9 ; \ + movl $0xfffffffe, %r11d ; \ + adcq %r15, %r11 ; \ + adcq %r8, %rax ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rbx, %r13 ; \ + cmovbq %r9, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_sub_p256 + +#define sub_p256(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + sbbq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movl $0xffffffff, %r10d ; \ + sbbq %r11, %r11 ; \ + xorq %rdx, %rdx ; \ + andq %r11, %r10 ; \ + subq %r10, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// Corresponds exactly to bignum_add_p256 + +#define add_p256(P0,P1,P2) \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + adcq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + adcq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + adcq 0x18+P2, %r9 ; \ + adcq %r11, %r11 ; \ + subq $0xffffffffffffffff, %rax ; \ + movl $0xffffffff, %r10d ; \ + sbbq %r10, %rcx ; \ + sbbq $0x0, %r8 ; \ + movq $0xffffffff00000001, %rdx ; \ + sbbq %rdx, %r9 ; \ + sbbq $0x0, %r11 ; \ + andq %r11, %r10 ; \ + andq %r11, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// A weak version of add that only guarantees sum in 4 digits + +#define weakadd_p256(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + adcq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + adcq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + adcq 0x18+P2, %r9 ; \ + movl $0xffffffff, %r10d ; \ + sbbq %r11, %r11 ; \ + xorq %rdx, %rdx ; \ + andq %r11, %r10 ; \ + subq %r10, %rdx ; \ + subq %r11, %rax ; \ + 
movq %rax, P0 ; \ + sbbq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + sbbq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// P0 = C * P1 - D * P2 computed as d * (p_256 - P2) + c * P1 +// Quotient estimation is done just as q = h + 1 as in bignum_triple_p256_alt. +// This also applies to the other functions following. + +#define cmsub_p256(P0,C,P1,D,P2) \ + /* First (%r12;%r11;%r10;%r9) = p_256 - P2 */ \ + movq $0xffffffffffffffff, %r9 ; \ + xorl %r11d, %r11d ; \ + subq P2, %r9 ; \ + movq $0x00000000ffffffff, %r10 ; \ + sbbq 0x8+P2, %r10 ; \ + sbbq 0x10+P2, %r11 ; \ + movq $0xffffffff00000001, %r12 ; \ + sbbq 0x18+P2, %r12 ; \ + /* (%r12;%r11;%r10;%r9;%r8) = D * (p_256 - P2) */ \ + movq $D, %rcx ; \ + movq %r9, %rax ; \ + mulq %rcx; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + movq %r10, %rax ; \ + xorl %r10d, %r10d ; \ + mulq %rcx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq %r11, %rax ; \ + xorl %r11d, %r11d ; \ + mulq %rcx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + movq %r12, %rax ; \ + xorl %r12d, %r12d ; \ + mulq %rcx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + /* (%rcx;%r11;%r10;%r9;%r8) = 2^256 + C * P1 + D * (p_256 - P2) */ \ + movl $C, %ecx ; \ + movq P1, %rax ; \ + mulq %rcx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rbx, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rbx, %rbx ; \ + movq 0x10+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbx, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + leaq 1(%r12), %rcx ; \ + /* Now the tail for modular reduction from tripling */ \ + movq $0xffffffff00000001, %rax ; \ + mulq %rcx; \ + movq %rcx, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %rcx, %r8 ; \ + sbbq $0x0, %rbx ; \ + subq %rbx, %r9 ; \ + sbbq $0x0, %r10 ; \ + sbbq %rax, %r11 ; \ + sbbq %rdx, %rcx ; \ + decq %rcx; \ + movl $0xffffffff, %eax ; \ + andq %rcx, %rax ; \ + xorl %edx, %edx ; \ + subq %rax, %rdx ; \ + addq %rcx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + adcq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + adcq %rdx, %r11 ; \ + movq %r11, 0x18+P0 + +// P0 = 3 * P1 - 8 * P2, computed as (p_256 - P2) << 3 + 3 * P1 + +#define cmsub38_p256(P0,P1,P2) \ + /* First (%r11;%r10;%r9;%r8) = p_256 - P2 */ \ + movq $0xffffffffffffffff, %r8 ; \ + xorl %r10d, %r10d ; \ + subq P2, %r8 ; \ + movq $0x00000000ffffffff, %r9 ; \ + sbbq 0x8+P2, %r9 ; \ + sbbq 0x10+P2, %r10 ; \ + movq $0xffffffff00000001, %r11 ; \ + sbbq 0x18+P2, %r11 ; \ + /* (%r12;%r11;%r10;%r9;%r8) = (p_256 - P2) << 3 */ \ + movq %r11, %r12 ; \ + shldq $3, %r10, %r11 ; \ + shldq $3, %r9, %r10 ; \ + shldq $3, %r8, %r9 ; \ + shlq $3, %r8 ; \ + shrq $61, %r12 ; \ + /* (%rcx;%r11;%r10;%r9;%r8) = 2^256 + 3 * P1 + 8 * (p_256 - P2) */ \ + movl $3, %ecx ; \ + movq P1, %rax ; \ + mulq %rcx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rbx, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rbx, %rbx ; \ + movq 0x10+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbx, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + leaq 1(%r12), %rcx ; \ + /* Now the tail for modular reduction from tripling */ \ + movq $0xffffffff00000001, %rax ; \ + mulq %rcx; \ + movq %rcx, %rbx ; 
\ + shlq $0x20, %rbx ; \ + addq %rcx, %r8 ; \ + sbbq $0x0, %rbx ; \ + subq %rbx, %r9 ; \ + sbbq $0x0, %r10 ; \ + sbbq %rax, %r11 ; \ + sbbq %rdx, %rcx ; \ + decq %rcx; \ + movl $0xffffffff, %eax ; \ + andq %rcx, %rax ; \ + xorl %edx, %edx ; \ + subq %rax, %rdx ; \ + addq %rcx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + adcq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + adcq %rdx, %r11 ; \ + movq %r11, 0x18+P0 + +// P0 = 4 * P1 - P2, by direct subtraction of P2, +// since the quotient estimate still works safely +// for initial value > -p_256 + +#define cmsub41_p256(P0,P1,P2) \ + movq 0x18+P1, %r11 ; \ + movq %r11, %rcx ; \ + movq 0x10+P1, %r10 ; \ + shldq $2, %r10, %r11 ; \ + movq 0x8+P1, %r9 ; \ + shldq $2, %r9, %r10 ; \ + movq P1, %r8 ; \ + shldq $2, %r8, %r9 ; \ + shlq $2, %r8 ; \ + shrq $62, %rcx ; \ + addq $1, %rcx ; \ + subq P2, %r8 ; \ + sbbq 0x8+P2, %r9 ; \ + sbbq 0x10+P2, %r10 ; \ + sbbq 0x18+P2, %r11 ; \ + sbbq $0, %rcx ; \ + /* Now the tail for modular reduction from tripling */ \ + movq $0xffffffff00000001, %rax ; \ + mulq %rcx; \ + movq %rcx, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %rcx, %r8 ; \ + sbbq $0x0, %rbx ; \ + subq %rbx, %r9 ; \ + sbbq $0x0, %r10 ; \ + sbbq %rax, %r11 ; \ + sbbq %rdx, %rcx ; \ + decq %rcx; \ + movl $0xffffffff, %eax ; \ + andq %rcx, %rax ; \ + xorl %edx, %edx ; \ + subq %rax, %rdx ; \ + addq %rcx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + adcq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + adcq %rdx, %r11 ; \ + movq %r11, 0x18+P0 + +S2N_BN_SYMBOL(p256_montjdouble_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room on stack for temporary variables + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + montsqr_p256(z2,z_1) + montsqr_p256(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + sub_p256(t2,x_1,z2) + weakadd_p256(t1,x_1,z2) + montmul_p256(x2p,t1,t2) + +// t1 = y + z +// xy2 = x * y^2 +// x4p = x2p^2 + + add_p256(t1,y_1,z_1) + montmul_p256(xy2,x_1,y2) + montsqr_p256(x4p,x2p) + +// t1 = (y + z)^2 + + montsqr_p256(t1,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_p256(d,12,xy2,9,x4p) + sub_p256(t1,t1,z2) + +// y4 = y^4 + + montsqr_p256(y4,y2) + +// dx2 = d * x2p + + montmul_p256(dx2,d,x2p) + +// z_3' = 2 * y * z + + sub_p256(z_3,t1,y2) + +// x' = 4 * xy2 - d + + cmsub41_p256(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_p256(y_3,dx2,y4) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjmixadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjmixadd.S new file mode 100644 index 00000000000..51b1f4923b7 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjmixadd.S @@ -0,0 +1,562 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
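
As a cross-check on the doubling schedule used by p256_montjdouble and p256_montjdouble_alt above (z2 = z^2, x2p = x^2 - z^4, d = 12*xy2 - 9*x4p, x' = 4*xy2 - d, y' = 3*dx2 - 8*y4, z' = 2*y*z), here is a minimal sketch replaying the same sequence in plain modular arithmetic, with Montgomery form omitted. The name jacobian_double_p256 is illustrative only, not part of the patch; the result matches the usual a = -3 Jacobian doubling with alpha = 3*(x - z^2)*(x + z^2).

```python
p256 = 2**256 - 2**224 + 2**192 + 2**96 - 1

def jacobian_double_p256(x, y, z):
    # Same field-operation schedule as the assembly, without Montgomery form.
    z2  = z * z % p256
    y2  = y * y % p256
    x2p = (x - z2) * (x + z2) % p256      # x^2 - z^4 = alpha / 3
    t1  = (y + z) % p256
    xy2 = x * y2 % p256                   # beta = x * y^2
    x4p = x2p * x2p % p256
    t1  = t1 * t1 % p256                  # (y + z)^2
    d   = (12 * xy2 - 9 * x4p) % p256
    t1  = (t1 - z2) % p256                # y^2 + 2*y*z
    y4  = y2 * y2 % p256
    dx2 = d * x2p % p256
    z3  = (t1 - y2) % p256                # 2*y*z
    x3  = (4 * xy2 - d) % p256            # = 9*x2p^2 - 8*beta = alpha^2 - 8*beta
    y3  = (3 * dx2 - 8 * y4) % p256       # = alpha*(4*beta - x3) - 8*y^4
    return x3, y3, z3
```
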
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjmixadd +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjmixadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjmixadd) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs. +// These assume %rdi = p3, %rsi = p1 and %rbp = p2, +// which needs to be set up explicitly before use. +// The first two hold initially, and the second is +// set up by copying the initial %rdx input to %rbp. +// Thereafter, no code macro modifies any of them. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rbp) +#define y_2 NUMSIZE(%rbp) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing. +// NSPACE is the total stack needed for all temporaries. + +#define zp2 (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define NSPACE (NUMSIZE*6) + +// Corresponds exactly to bignum_montmul_p256 + +#define montmul_p256(P0,P1,P2) \ + xorl %r13d, %r13d ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rbx, %r10 ; \ + adcq %rbx, %r9 ; \ + mulxq 0x10+P1, %rbx, %r11 ; \ + adcq %rbx, %r10 ; \ + mulxq 0x18+P1, %rbx, %r12 ; \ + adcq %rbx, %r11 ; \ + adcq %r13, %r12 ; \ + movq 0x8+P2, %rdx ; \ + xorl %r14d, %r14d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcq %r14, %r13 ; \ + xorl %r15d, %r15d ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + notq %rdx; \ + leaq 0x2(%rdx), %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %r15, %r13 ; \ + adoxq %r15, %r14 ; \ + adcq %r15, %r14 ; \ + movq 0x10+P2, %rdx ; \ + xorl %r8d, %r8d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, 
%r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adoxq %r8, %r14 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcq %rax, %r13 ; \ + adcq %rbx, %r14 ; \ + adcq %r8, %r15 ; \ + movq 0x18+P2, %rdx ; \ + xorl %r9d, %r9d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + adoxq %r9, %r15 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcq %rax, %r14 ; \ + adcq %rbx, %r15 ; \ + adcq %r9, %r8 ; \ + xorl %r9d, %r9d ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + notq %rdx; \ + leaq 0x2(%rdx), %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + adcxq %r9, %r15 ; \ + adoxq %r9, %r8 ; \ + adcq %r9, %r8 ; \ + movl $0x1, %ecx ; \ + addq %r12, %rcx ; \ + decq %rdx; \ + adcq %r13, %rdx ; \ + decq %r9; \ + movq %r9, %rax ; \ + adcq %r14, %r9 ; \ + movl $0xfffffffe, %r11d ; \ + adcq %r15, %r11 ; \ + adcq %r8, %rax ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %r9, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_montsqr_p256 except for +// register tweaks to avoid modifying %rbp. + +#define montsqr_p256(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rcx, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r15 ; \ + xorl %ecx, %ecx ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + movabsq $0xffffffff00000001, %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rcx, %r13 ; \ + movl %ecx, %r9d ; \ + adoxq %rcx, %r9 ; \ + adcxq %rcx, %r9 ; \ + addq %r9, %r14 ; \ + adcq %rcx, %r15 ; \ + movl %ecx, %r8d ; \ + adcq %rcx, %r8 ; \ + xorl %ecx, %ecx ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + movabsq $0xffffffff00000001, %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, 
%r14 ; \ + adoxq %rbx, %r15 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r8 ; \ + adcq %rcx, %r8 ; \ + movl $0x1, %ebx ; \ + addq %r12, %rbx ; \ + leaq -0x1(%rdx), %rdx ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rcx), %rcx ; \ + movq %rcx, %rax ; \ + adcq %r14, %rcx ; \ + movl $0xfffffffe, %r11d ; \ + adcq %r15, %r11 ; \ + adcq %r8, %rax ; \ + cmovbq %rbx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rcx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Almost-Montgomery variant which we use when an input to other muls +// with the other argument fully reduced (which is always safe). +// Again, the basic squaring code is tweaked to avoid modifying %rbp. + +#define amontsqr_p256(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rcx, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r15 ; \ + xorl %ecx, %ecx ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + movabsq $0xffffffff00000001, %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rcx, %r13 ; \ + movl %ecx, %r9d ; \ + adoxq %rcx, %r9 ; \ + adcxq %rcx, %r9 ; \ + addq %r9, %r14 ; \ + adcq %rcx, %r15 ; \ + movl %ecx, %r8d ; \ + adcq %rcx, %r8 ; \ + xorl %ecx, %ecx ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + movabsq $0xffffffff00000001, %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r8 ; \ + adcq %rcx, %r8 ; \ + movl $0x1, %r8d ; \ + leaq -0x1(%rdx), %rdx ; \ + leaq -0x1(%rcx), %rax ; \ + movl $0xfffffffe, %r11d ; \ + cmovzq %rcx, %r8 ; \ + cmovzq %rcx, %rdx ; \ + cmovzq %rcx, %rax ; \ + cmovzq %rcx, %r11 ; \ + addq %r8, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %rax, %r14 ; \ + adcq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_sub_p256 + +#define sub_p256(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + sbbq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movl $0xffffffff, %r10d ; \ + sbbq %r11, %r11 ; \ + xorq %rdx, %rdx ; \ + andq %r11, %r10 ; \ + subq %r10, 
%rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// Additional macros to help with final multiplexing + +#define testzero4(P) \ + movq P, %rax ; \ + movq 8+P, %rdx ; \ + orq 16+P, %rax ; \ + orq 24+P, %rdx ; \ + orq %rdx, %rax + +#define mux4(r0,r1,r2,r3,PNE,PEQ) \ + movq PNE, r0 ; \ + movq PEQ, %rax ; \ + cmovzq %rax, r0 ; \ + movq 8+PNE, r1 ; \ + movq 8+PEQ, %rax ; \ + cmovzq %rax, r1 ; \ + movq 16+PNE, r2 ; \ + movq 16+PEQ, %rax ; \ + cmovzq %rax, r2 ; \ + movq 24+PNE, r3 ; \ + movq 24+PEQ, %rax ; \ + cmovzq %rax, r3 + +#define load4(r0,r1,r2,r3,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 + +#define store4(P,r0,r1,r2,r3) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P + +S2N_BN_SYMBOL(p256_montjmixadd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input y in %rbp where it lasts as long as it's needed. + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdx, %rbp + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + amontsqr_p256(zp2,z_1) + + montmul_p256(y2a,z_1,y_2) + montmul_p256(x2a,zp2,x_2) + montmul_p256(y2a,zp2,y2a) + + sub_p256(xd,x2a,x_1) + + sub_p256(yd,y2a,y_1) + + amontsqr_p256(zz,xd) + montsqr_p256(ww,yd) + + montmul_p256(zzx1,zz,x_1) + montmul_p256(zzx2,zz,x2a) + + sub_p256(resx,ww,zzx1) + sub_p256(t1,zzx2,zzx1) + + montmul_p256(resz,xd,z_1) + + sub_p256(resx,resx,zzx2) + + sub_p256(t2,zzx1,resx) + + montmul_p256(t1,t1,y_1) + montmul_p256(t2,yd,t2) + + sub_p256(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + testzero4(z_1) + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^256 - p_256), +// hence giving 0 + p2 = p2 for the final result. + + mux4(%r8,%r9,%r10,%r11,resx,x_2) + mux4(%r12,%r13,%r14,%r15,resy,y_2) + + store4(x_3,%r8,%r9,%r10,%r11) + store4(y_3,%r12,%r13,%r14,%r15) + + load4(%r8,%r9,%r10,%r11,resz) + movl $1, %eax + cmovzq %rax, %r8 + movq $0xffffffff00000000, %rax + cmovzq %rax, %r9 + movq $0xffffffffffffffff, %rax + cmovzq %rax, %r10 + movl $0x00000000fffffffe, %eax + cmovzq %rax, %r11 + + store4(z_3,%r8,%r9,%r10,%r11) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjmixadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjmixadd_alt.S new file mode 100644 index 00000000000..55f2dca1d3e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjmixadd_alt.S @@ -0,0 +1,547 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
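
The headers above keep every coordinate in the Montgomery domain, x' = (2^256 * x) mod p_256, and the multiplexing path of p256_montjmixadd writes the z = 1 coordinate as 2^256 - p_256 rather than the literal constant 1. A small plain-Python check of those two facts follows, together with an abstract model of what the montmul/montsqr macros compute; montredc and montmul are illustrative names, not from the patch, and the word-by-word model works because p_256 = -1 (mod 2^64), so the per-word quotient is simply the current low word.

```python
p256 = 2**256 - 2**224 + 2**192 + 2**96 - 1

# Montgomery form of 1 is 2^256 mod p_256 = 2^256 - p_256; its 64-bit
# limbs are exactly the constants loaded for z_3 in the p1 = 0 case above.
one_m = (1 << 256) % p256
assert one_m == (1 << 256) - p256
assert [(one_m >> (64 * i)) & (2**64 - 1) for i in range(4)] == \
       [0x0000000000000001, 0xffffffff00000000,
        0xffffffffffffffff, 0x00000000fffffffe]

def montredc(t):
    # Word-by-word Montgomery reduction: divide by 2^256 modulo p_256.
    for _ in range(4):
        t = (t + (t & (2**64 - 1)) * p256) >> 64
    return t - p256 if t >= p256 else t

def montmul(a, b):
    # Abstract model of the montmul macros: a' * b' |-> (a*b)' in the
    # Montgomery domain, i.e. a * b * 2^-256 mod p_256.
    return montredc(a * b)
```
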
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjmixadd_alt +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjmixadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjmixadd_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs. +// These assume %rdi = p3, %rsi = p1 and %rbp = p2, +// which needs to be set up explicitly before use. +// The first two hold initially, and the second is +// set up by copying the initial %rdx input to %rbp. +// Thereafter, no code macro modifies any of them. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rbp) +#define y_2 NUMSIZE(%rbp) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing. +// NSPACE is the total stack needed for all temporaries. + +#define zp2 (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define NSPACE (NUMSIZE*6) + +// Corresponds exactly to bignum_montmul_p256_alt + +#define montmul_p256(P0,P1,P2) \ + movq P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + xorl %r10d, %r10d ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + xorl %r11d, %r11d ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + xorl %r12d, %r12d ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + movq 0x8+P2, %rbx ; \ + xorl %r13d, %r13d ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r14, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r14, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r14, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r14, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + xorl %r14d, %r14d ; \ + movq $0x100000000, %rbx ; \ + movq %r8, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r15, %r15 ; \ + movq %r9, %rax ; \ + mulq %rbx; \ + subq %r15, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r15, %r15 ; \ + notq %rbx; \ + leaq 
0x2(%rbx), %rbx ; \ + movq %r8, %rax ; \ + mulq %rbx; \ + subq %r15, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r15, %r15 ; \ + movq %r9, %rax ; \ + mulq %rbx; \ + subq %r15, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P2, %rbx ; \ + xorl %r15d, %r15d ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r8, %r8 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r8, %r8 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r8, %r8 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P2, %rbx ; \ + xorl %r8d, %r8d ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r9, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r9, %r9 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r9, %r9 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r8, %r8 ; \ + xorl %r9d, %r9d ; \ + movq $0x100000000, %rbx ; \ + movq %r10, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + movq %r11, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rcx, %rcx ; \ + notq %rbx; \ + leaq 0x2(%rbx), %rbx ; \ + movq %r10, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rcx, %rcx ; \ + movq %r11, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r9, %r8 ; \ + movl $0x1, %ecx ; \ + addq %r12, %rcx ; \ + decq %rbx; \ + adcq %r13, %rbx ; \ + decq %r9; \ + movq %r9, %rax ; \ + adcq %r14, %r9 ; \ + movl $0xfffffffe, %r11d ; \ + adcq %r15, %r11 ; \ + adcq %r8, %rax ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rbx, %r13 ; \ + cmovbq %r9, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_montsqr_p256_alt + +#define montsqr_p256(P0,P1) \ + movq P1, %rax ; \ + movq %rax, %rbx ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r15 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r9 ; \ + movq %rdx, %r10 ; \ + movq 0x18+P1, %rax ; \ + movq %rax, %r13 ; \ + mulq %rbx; \ + movq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + movq 0x10+P1, %rax ; \ + movq %rax, %rbx ; \ + mulq %r13; \ + movq %rax, %r13 ; \ + movq %rdx, %r14 ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + movq 0x18+P1, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorl %ecx, %ecx ; \ + addq %r9, %r9 ; \ + adcq %r10, %r10 ; \ + adcq %r11, %r11 ; \ + adcq %r12, %r12 ; \ + adcq %r13, %r13 ; \ + adcq %r14, %r14 ; \ + adcq %rcx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %r15, %r9 ; \ + adcq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + negq %r15; \ + adcq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r15, %r15 
; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + negq %r15; \ + adcq %rax, %r14 ; \ + adcq %rcx, %rdx ; \ + movq %rdx, %r15 ; \ + movq $0x100000000, %rbx ; \ + movq %r8, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r9, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + notq %rbx; \ + leaq 0x2(%rbx), %rbx ; \ + movq %r8, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + xorl %r8d, %r8d ; \ + movq %r9, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r8, %r14 ; \ + adcq %r8, %r15 ; \ + adcq %r8, %r8 ; \ + movq $0x100000000, %rbx ; \ + movq %r10, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + movq %r11, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rcx, %rcx ; \ + notq %rbx; \ + leaq 0x2(%rbx), %rbx ; \ + movq %r10, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rcx, %rcx ; \ + xorl %r9d, %r9d ; \ + movq %r11, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r9, %r8 ; \ + movl $0x1, %ecx ; \ + addq %r12, %rcx ; \ + leaq -0x1(%rbx), %rbx ; \ + adcq %r13, %rbx ; \ + leaq -0x1(%r9), %r9 ; \ + movq %r9, %rax ; \ + adcq %r14, %r9 ; \ + movl $0xfffffffe, %r11d ; \ + adcq %r15, %r11 ; \ + adcq %r8, %rax ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rbx, %r13 ; \ + cmovbq %r9, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_sub_p256 + +#define sub_p256(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + sbbq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movl $0xffffffff, %r10d ; \ + sbbq %r11, %r11 ; \ + xorq %rdx, %rdx ; \ + andq %r11, %r10 ; \ + subq %r10, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// Additional macros to help with final multiplexing + +#define testzero4(P) \ + movq P, %rax ; \ + movq 8+P, %rdx ; \ + orq 16+P, %rax ; \ + orq 24+P, %rdx ; \ + orq %rdx, %rax + +#define mux4(r0,r1,r2,r3,PNE,PEQ) \ + movq PNE, r0 ; \ + movq PEQ, %rax ; \ + cmovzq %rax, r0 ; \ + movq 8+PNE, r1 ; \ + movq 8+PEQ, %rax ; \ + cmovzq %rax, r1 ; \ + movq 16+PNE, r2 ; \ + movq 16+PEQ, %rax ; \ + cmovzq %rax, r2 ; \ + movq 24+PNE, r3 ; \ + movq 24+PEQ, %rax ; \ + cmovzq %rax, r3 + +#define load4(r0,r1,r2,r3,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 + +#define store4(P,r0,r1,r2,r3) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P + +S2N_BN_SYMBOL(p256_montjmixadd_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input y in %rbp where it lasts as long as it's needed. 
+ + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdx, %rbp + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + montsqr_p256(zp2,z_1) + + montmul_p256(y2a,z_1,y_2) + montmul_p256(x2a,zp2,x_2) + montmul_p256(y2a,zp2,y2a) + + sub_p256(xd,x2a,x_1) + + sub_p256(yd,y2a,y_1) + + montsqr_p256(zz,xd) + montsqr_p256(ww,yd) + + montmul_p256(zzx1,zz,x_1) + montmul_p256(zzx2,zz,x2a) + + sub_p256(resx,ww,zzx1) + sub_p256(t1,zzx2,zzx1) + + montmul_p256(resz,xd,z_1) + + sub_p256(resx,resx,zzx2) + + sub_p256(t2,zzx1,resx) + + montmul_p256(t1,t1,y_1) + montmul_p256(t2,yd,t2) + + sub_p256(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + testzero4(z_1) + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^256 - p_256), +// hence giving 0 + p2 = p2 for the final result. + + mux4(%r8,%r9,%r10,%r11,resx,x_2) + mux4(%r12,%r13,%r14,%r15,resy,y_2) + + store4(x_3,%r8,%r9,%r10,%r11) + store4(y_3,%r12,%r13,%r14,%r15) + + load4(%r8,%r9,%r10,%r11,resz) + movl $1, %eax + cmovzq %rax, %r8 + movq $0xffffffff00000000, %rax + cmovzq %rax, %r9 + movq $0xffffffffffffffff, %rax + cmovzq %rax, %r10 + movl $0x00000000fffffffe, %eax + cmovzq %rax, %r11 + + store4(z_3,%r8,%r9,%r10,%r11) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjscalarmul.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjscalarmul.S index 4569646cd31..1904ca193eb 100644 --- a/third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjscalarmul.S @@ -76,6 +76,7 @@ cmovzq TAB+96*(I-1)+88(%rsp), %r15 S2N_BN_SYMBOL(p256_montjscalarmul): + _CET_ENDBR // The Windows version literally calls the standard ABI version. // This simplifies the proofs since subroutine offsets are fixed. diff --git a/third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjscalarmul_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjscalarmul_alt.S index b68d857e76b..1124a046fe1 100644 --- a/third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjscalarmul_alt.S @@ -76,6 +76,7 @@ cmovzq TAB+96*(I-1)+88(%rsp), %r15 S2N_BN_SYMBOL(p256_montjscalarmul_alt): + _CET_ENDBR // The Windows version literally calls the standard ABI version. // This simplifies the proofs since subroutine offsets are fixed. 
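
The 8-multiply / 3-square / 7-subtract schedule shared by p256_montjmixadd and p256_montjmixadd_alt above is the standard Jacobian mixed addition with H = U2 - X1 and r = S2 - Y1. Here is a minimal sketch replaying that schedule in plain modular arithmetic, with Montgomery form and the p1 = 0 multiplexing omitted; mixadd_schedule is an illustrative name, not part of the patch.

```python
p256 = 2**256 - 2**224 + 2**192 + 2**96 - 1

def mixadd_schedule(x1, y1, z1, x2, y2):
    # Same order of field operations as the assembly's temporaries.
    zp2  = z1 * z1 % p256             # z1^2
    y2a  = z1 * y2 % p256
    x2a  = zp2 * x2 % p256            # U2 = x2 * z1^2
    y2a  = zp2 * y2a % p256           # S2 = y2 * z1^3
    xd   = (x2a - x1) % p256          # H
    yd   = (y2a - y1) % p256          # r
    zz   = xd * xd % p256             # H^2
    ww   = yd * yd % p256             # r^2
    zzx1 = zz * x1 % p256             # X1*H^2
    zzx2 = zz * x2a % p256            # U2*H^2 = X1*H^2 + H^3
    resx = (ww - zzx1) % p256
    t1   = (zzx2 - zzx1) % p256       # H^3
    resz = xd * z1 % p256             # Z3 = H*z1
    resx = (resx - zzx2) % p256       # X3 = r^2 - H^3 - 2*X1*H^2
    t2   = (zzx1 - resx) % p256       # X1*H^2 - X3
    t1   = t1 * y1 % p256             # Y1*H^3
    t2   = yd * t2 % p256
    resy = (t2 - t1) % p256           # Y3 = r*(X1*H^2 - X3) - Y1*H^3
    return resx, resy, resz
```
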
diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_scalarmul.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_scalarmul.S new file mode 100644 index 00000000000..3e426269977 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_scalarmul.S @@ -0,0 +1,6802 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Scalar multiplication for P-256 +// Input scalar[4], point[8]; output res[8] +// +// extern void p256_scalarmul +// (uint64_t res[static 8],uint64_t scalar[static 4], +// uint64_t point[static 8]); +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve P-256, returns the point (X,Y) = n * P. The input and output +// are affine points, and in the case of the point at infinity as +// the result, (0,0) is returned. +// +// Standard x86-64 ABI: RDI = res, RSI = scalar, RDX = point +// Microsoft x64 ABI: RCX = res, RDX = scalar, R8 = point +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_scalarmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_scalarmul) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Intermediate variables on the stack. The last z2, z3 values can +// safely be overlaid on the table, which is no longer needed at the end. +// Uppercase syntactic variants make x86_att version simpler to generate + +#define SCALARB (0*NUMSIZE) +#define scalarb (0*NUMSIZE)(%rsp) +#define ACC (1*NUMSIZE) +#define acc (1*NUMSIZE)(%rsp) +#define TABENT (4*NUMSIZE) +#define tabent (4*NUMSIZE)(%rsp) + +#define TAB (7*NUMSIZE) +#define tab (7*NUMSIZE)(%rsp) + +#define Z2 (7*NUMSIZE) +#define z2 (7*NUMSIZE)(%rsp) +#define Z3 (8*NUMSIZE) +#define z3 (8*NUMSIZE)(%rsp) + +#define res (31*NUMSIZE)(%rsp) + +#define NSPACE (32*NUMSIZE) + +S2N_BN_SYMBOL(p256_scalarmul): + _CET_ENDBR + +// The Windows version literally calls the standard ABI version. +// This simplifies the proofs since subroutine offsets are fixed. + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + callq p256_scalarmul_standard + popq %rsi + popq %rdi + ret + +p256_scalarmul_standard: +#endif + +// Real start of the standard ABI code. + + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + + subq $NSPACE, %rsp + +// Preserve the "res" and "point" input arguments. We load and process the +// scalar immediately so we don't bother preserving that input argument. +// Also, "point" is only needed early on and so its register gets re-used. + + movq %rdx, %rbx + movq %rdi, res + +// Load the digits of group order n_256 = [%r15;%r14;%r13;%r12] + + movq $0xf3b9cac2fc632551, %r12 + movq $0xbce6faada7179e84, %r13 + movq $0xffffffffffffffff, %r14 + movq $0xffffffff00000000, %r15 + +// First, reduce the input scalar mod n_256, i.e. conditionally subtract n_256 + + movq (%rsi), %r8 + subq %r12, %r8 + movq 8(%rsi), %r9 + sbbq %r13, %r9 + movq 16(%rsi), %r10 + sbbq %r14, %r10 + movq 24(%rsi), %r11 + sbbq %r15, %r11 + + cmovcq (%rsi), %r8 + cmovcq 8(%rsi), %r9 + cmovcq 16(%rsi), %r10 + cmovcq 24(%rsi), %r11 + +// Now if the top bit of the reduced scalar is set, negate it mod n_256, +// i.e. do n |-> n_256 - n. Remember the sign in %rbp so we can +// correspondingly negate the point below. 
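
The scalar preprocessing described here (reduce mod n_256, negate when the top bit is set and remember the sign so the point can be negated to compensate) combines with the 0x888...8 recoding constant and the bit-255 flip added just below to turn the scalar into one unsigned top window in 0..8 plus 63 signed 4-bit digits in [-8, 7]. A hedged plain-Python sketch of that recoding follows; recode_p256 is an illustrative name, not part of the patch.

```python
n256 = 0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551
K = int("8" * 64, 16)                 # the 0x8888...8888 recoding constant

def recode_p256(n):
    n %= n256                         # conditional subtraction of n_256
    neg = n >> 255                    # sign flag; the point is negated when set
    if neg:
        n = n256 - n                  # now n < 2^255
    m = ((n + K) % 2**256) ^ (1 << 255)   # add constant, toggle bit 255 (btc)
    digits = [((m >> (4 * i)) & 0xf) - 8 for i in range(63)]
    top = m >> 252                    # top window, used unsigned (0..8)
    # The digits reconstruct the (possibly negated) scalar exactly.
    assert top * 16**63 + sum(d * 16**i for i, d in enumerate(digits)) == n
    return neg, top, digits
```
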
+ + subq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + sbbq %r11, %r15 + + movq %r11, %rbp + shrq $63, %rbp + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + +// In either case then add the recoding constant 0x08888...888 to allow +// signed digits. + + movq $0x8888888888888888, %rax + addq %rax, %r8 + adcq %rax, %r9 + adcq %rax, %r10 + adcq %rax, %r11 + btc $63, %r11 + + movq %r8, SCALARB(%rsp) + movq %r9, SCALARB+8(%rsp) + movq %r10, SCALARB+16(%rsp) + movq %r11, SCALARB+24(%rsp) + +// Set the tab[0] table entry to Montgomery-Jacobian point = 1 * P +// The z coordinate is just the Montgomery form of the constant 1. + + leaq TAB(%rsp), %rdi + movq %rbx, %rsi + callq p256_scalarmul_local_tomont_p256 + + leaq 32(%rbx), %rsi + leaq TAB+32(%rsp), %rdi + callq p256_scalarmul_local_tomont_p256 + + movl $1, %eax + movq %rax, TAB+64(%rsp) + movq $0xffffffff00000000, %rdx + movq %rdx, TAB+72(%rsp) + subq $2, %rax + movq %rax, TAB+80(%rsp) + movq $0x00000000fffffffe, %rax + movq %rax, TAB+88(%rsp) + +// If the top bit of the scalar was set, negate (y coordinate of) the point + + movq TAB+32(%rsp), %r12 + movq TAB+40(%rsp), %r13 + movq TAB+48(%rsp), %r14 + movq TAB+56(%rsp), %r15 + + xorl %r10d, %r10d + leaq -1(%r10), %r8 + movq $0x00000000ffffffff, %r11 + movq %r11, %r9 + negq %r11 + + subq %r12, %r8 + sbbq %r13, %r9 + sbbq %r14, %r10 + sbbq %r15, %r11 + + testq %rbp, %rbp + cmovzq %r12, %r8 + cmovzq %r13, %r9 + cmovzq %r14, %r10 + cmovzq %r15, %r11 + + movq %r8, TAB+32(%rsp) + movq %r9, TAB+40(%rsp) + movq %r10, TAB+48(%rsp) + movq %r11, TAB+56(%rsp) + +// Compute and record tab[1] = 2 * p, ..., tab[7] = 8 * P + + leaq TAB+96*1(%rsp), %rdi + leaq TAB(%rsp), %rsi + callq p256_scalarmul_local_p256_montjdouble + + leaq TAB+96*2(%rsp), %rdi + leaq TAB+96*1(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq p256_scalarmul_local_p256_montjmixadd + + leaq TAB+96*3(%rsp), %rdi + leaq TAB+96*1(%rsp), %rsi + callq p256_scalarmul_local_p256_montjdouble + + leaq TAB+96*4(%rsp), %rdi + leaq TAB+96*3(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq p256_scalarmul_local_p256_montjmixadd + + leaq TAB+96*5(%rsp), %rdi + leaq TAB+96*2(%rsp), %rsi + callq p256_scalarmul_local_p256_montjdouble + + leaq TAB+96*6(%rsp), %rdi + leaq TAB+96*5(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq p256_scalarmul_local_p256_montjmixadd + + leaq TAB+96*7(%rsp), %rdi + leaq TAB+96*3(%rsp), %rsi + callq p256_scalarmul_local_p256_montjdouble + +// Set up accumulator as table entry for top 4 bits (constant-time indexing) + + movq SCALARB+24(%rsp), %rdi + shrq $60, %rdi + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d + + .set I, 1 +.rep 8 + cmpq $I, %rdi + + cmovzq TAB+96*(I-1)(%rsp), %rax + cmovzq TAB+96*(I-1)+8(%rsp), %rbx + cmovzq TAB+96*(I-1)+16(%rsp), %rcx + cmovzq TAB+96*(I-1)+24(%rsp), %rdx + cmovzq TAB+96*(I-1)+32(%rsp), %r8 + cmovzq TAB+96*(I-1)+40(%rsp), %r9 + cmovzq TAB+96*(I-1)+48(%rsp), %r10 + cmovzq TAB+96*(I-1)+56(%rsp), %r11 + cmovzq TAB+96*(I-1)+64(%rsp), %r12 + cmovzq TAB+96*(I-1)+72(%rsp), %r13 + cmovzq TAB+96*(I-1)+80(%rsp), %r14 + cmovzq TAB+96*(I-1)+88(%rsp), %r15 + .set I, (I+1) +.endr + movq %rax, ACC(%rsp) + movq %rbx, ACC+8(%rsp) + movq %rcx, ACC+16(%rsp) + movq %rdx, ACC+24(%rsp) + movq %r8, ACC+32(%rsp) + movq %r9, ACC+40(%rsp) + movq %r10, ACC+48(%rsp) + movq %r11, ACC+56(%rsp) + movq %r12, ACC+64(%rsp) + movq 
%r13, ACC+72(%rsp) + movq %r14, ACC+80(%rsp) + movq %r15, ACC+88(%rsp) + +// Main loop over size-4 bitfield + + movl $252, %ebp + +p256_scalarmul_loop: + subq $4, %rbp + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p256_scalarmul_local_p256_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p256_scalarmul_local_p256_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p256_scalarmul_local_p256_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p256_scalarmul_local_p256_montjdouble + + movq %rbp, %rax + shrq $6, %rax + movq (%rsp,%rax,8), %rdi + movq %rbp, %rcx + shrq %cl, %rdi + andq $15, %rdi + + subq $8, %rdi + sbbq %rsi, %rsi // %rsi = sign of digit (-1 = negative) + xorq %rsi, %rdi + subq %rsi, %rdi // %rdi = absolute value of digit + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d + + .set I, 1 +.rep 8 + cmpq $I, %rdi + + cmovzq TAB+96*(I-1)(%rsp), %rax + cmovzq TAB+96*(I-1)+8(%rsp), %rbx + cmovzq TAB+96*(I-1)+16(%rsp), %rcx + cmovzq TAB+96*(I-1)+24(%rsp), %rdx + cmovzq TAB+96*(I-1)+32(%rsp), %r8 + cmovzq TAB+96*(I-1)+40(%rsp), %r9 + cmovzq TAB+96*(I-1)+48(%rsp), %r10 + cmovzq TAB+96*(I-1)+56(%rsp), %r11 + cmovzq TAB+96*(I-1)+64(%rsp), %r12 + cmovzq TAB+96*(I-1)+72(%rsp), %r13 + cmovzq TAB+96*(I-1)+80(%rsp), %r14 + cmovzq TAB+96*(I-1)+88(%rsp), %r15 + .set I, (I+1) +.endr + + movq %r12, TABENT+64(%rsp) + movq %r13, TABENT+72(%rsp) + movq %r14, TABENT+80(%rsp) + movq %r15, TABENT+88(%rsp) + + xorl %r14d, %r14d + leaq -1(%r14), %r12 + movq $0x00000000ffffffff, %r15 + movq %r15, %r13 + negq %r15 + + subq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + sbbq %r11, %r15 + + testq %rsi, %rsi + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) + + movq %r8, TABENT+32(%rsp) + movq %r9, TABENT+40(%rsp) + movq %r10, TABENT+48(%rsp) + movq %r11, TABENT+56(%rsp) + + leaq TABENT(%rsp), %rdx + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p256_scalarmul_local_p256_montjadd + + testq %rbp, %rbp + jne p256_scalarmul_loop + +// Let z2 = 1/z^2 and z3 = 1/z^3, both without Montgomery form + + leaq Z2(%rsp), %rdi + leaq ACC+64(%rsp), %rsi + callq p256_scalarmul_local_montsqr_p256 + + leaq Z3(%rsp), %rdi + leaq ACC+64(%rsp), %rsi + leaq Z2(%rsp), %rdx + callq p256_scalarmul_local_montmul_p256 + + leaq Z2(%rsp), %rdi + leaq Z3(%rsp), %rsi + callq p256_scalarmul_local_demont_p256 + + leaq Z3(%rsp), %rdi + leaq Z2(%rsp), %rsi + callq p256_scalarmul_local_inv_p256 + + leaq Z2(%rsp), %rdi + leaq ACC+64(%rsp), %rsi + leaq Z3(%rsp), %rdx + callq p256_scalarmul_local_montmul_p256 + +// Convert back from Jacobian (X, Y, Z) |-> (X/Z^2, Y/Z^3) + + movq res, %rdi + leaq ACC(%rsp), %rsi + leaq Z2(%rsp), %rdx + movq %rdi, %rbx + callq p256_scalarmul_local_montmul_p256 + + leaq 32(%rbx), %rdi + leaq ACC+32(%rsp), %rsi + leaq Z3(%rsp), %rdx + callq p256_scalarmul_local_montmul_p256 + +// Restore stack and registers and return + + addq $NSPACE, %rsp + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 + ret + +// Local copies of subroutines, complete clones at the moment + +p256_scalarmul_local_demont_p256: + pushq %rbx + movq (%rsi), %r8 + movq 0x8(%rsi), %r9 + movq 0x10(%rsi), %r10 + movq 0x18(%rsi), %r11 + xorq %rbx, %rbx 
+ xorq %rsi, %rsi + movq $0x100000000, %rdx + mulxq %r8, %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq %r9, %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %rbx + mulxq %r9, %rax, %rcx + adcxq %rax, %rbx + adoxq %rcx, %rsi + movl $0x0, %r8d + adcxq %r8, %rsi + xorq %r9, %r9 + movq $0x100000000, %rdx + mulxq %r10, %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %rbx + mulxq %r11, %rax, %rcx + adcxq %rax, %rbx + adoxq %rcx, %rsi + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rcx + adcxq %rax, %rsi + adoxq %rcx, %r8 + mulxq %r11, %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + movl $0x0, %r10d + adcxq %r10, %r9 + movq %rbx, (%rdi) + movq %rsi, 0x8(%rdi) + movq %r8, 0x10(%rdi) + movq %r9, 0x18(%rdi) + popq %rbx + ret + +p256_scalarmul_local_inv_p256: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xf0, %rsp + movq %rdi, 0xe0(%rsp) + xorl %ecx, %ecx + movl $0xffffffff, %edx + movq %rdx, %rbx + leaq -0x1(%rcx), %rax + negq %rdx + movq %rax, (%rsp) + movq %rbx, 0x8(%rsp) + movq %rcx, 0x10(%rsp) + movq %rdx, 0x18(%rsp) + movq %rcx, 0x20(%rsp) + movq (%rsi), %r8 + movq 0x8(%rsi), %r9 + movq 0x10(%rsi), %r10 + movq 0x18(%rsi), %r11 + leaq 0x1(%rcx), %rax + addq %r8, %rax + leaq -0x1(%rdx), %rbx + adcq %r9, %rbx + notq %rcx + adcq %r10, %rcx + notq %rdx + adcq %r11, %rdx + cmovaeq %r8, %rax + cmovaeq %r9, %rbx + cmovaeq %r10, %rcx + cmovaeq %r11, %rdx + movq %rax, 0x28(%rsp) + movq %rbx, 0x30(%rsp) + movq %rcx, 0x38(%rsp) + movq %rdx, 0x40(%rsp) + xorl %eax, %eax + movq %rax, 0x48(%rsp) + xorl %eax, %eax + movq %rax, 0x50(%rsp) + movq %rax, 0x58(%rsp) + movq %rax, 0x60(%rsp) + movq %rax, 0x68(%rsp) + movq $0x4000000000000, %rcx + movq %rcx, 0x78(%rsp) + movq %rax, 0x80(%rsp) + movq %rax, 0x88(%rsp) + movq %rax, 0x90(%rsp) + movq $0xa, 0xb0(%rsp) + movq $0x1, 0xb8(%rsp) + jmp p256_scalarmul_inv_midloop +p256_scalarmul_inv_loop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0xa0(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0xa8(%rsp) + xorl %ebx, %ebx + movq (%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + xorl %ecx, %ecx + movq 0x8(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x28(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx 
+ adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x38(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x30(%rsp) + movq 0x18(%rsp), %rax + xorq %r9, %rax + movq 0x20(%rsp), %rbp + xorq %r9, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x40(%rsp), %rax + xorq %r11, %rax + movq 0x48(%rsp), %rdx + xorq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + sarq $0x3b, %rbp + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + movq 0x20(%rsp), %rsi + movq %rbp, 0x20(%rsp) + xorq %r13, %rax + xorq %r13, %rsi + andq %r12, %rsi + negq %rsi + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rsi + movq 0x40(%rsp), %rax + xorq %r15, %rax + movq 0x48(%rsp), %rdx + xorq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x38(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x40(%rsp) + sarq $0x3b, %rsi + movq %rsi, 0x48(%rsp) + movq 0xa0(%rsp), %rbx + movq 0xa8(%rsp), %rbp + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x78(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x78(%rsp) + xorl %ebx, %ebx + movq 0x58(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x80(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x58(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, 0x58(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x80(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x80(%rsp) + xorl %ecx, %ecx + movq 0x60(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x88(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x60(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x60(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x88(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x88(%rsp) + movq 0x68(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x90(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq 0x68(%rsp), %rax + movq %rcx, 0x68(%rsp) + movq %rdx, 0x70(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx + negq %rcx + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rcx + movq 0x90(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rsi, 0x90(%rsp) + movq %rdx, 0x98(%rsp) + movq $0xe000000000000000, %r8 + addq 0x50(%rsp), %r8 + movq $0xffffffffffffffff, %r9 + adcq 0x58(%rsp), %r9 + movq $0x1fffffff, %r10 + adcq 0x60(%rsp), %r10 + movq $0x2000000000000000, %r11 + adcq 0x68(%rsp), %r11 + movq $0x1fffffffe0000000, %r12 + adcq 0x70(%rsp), 
%r12 + movq %r8, %rbx + shlq $0x20, %rbx + movq $0xffffffff00000001, %rax + mulq %r8 + shrq $0x20, %r8 + addq %rbx, %r9 + adcq %r8, %r10 + adcq %rax, %r11 + adcq %rdx, %r12 + sbbq %rax, %rax + movl $0xffffffff, %ebx + andq %rax, %rbx + movq $0xffffffff00000001, %rdx + andq %rax, %rdx + subq %rax, %r9 + movq %r9, 0x50(%rsp) + sbbq %rbx, %r10 + movq %r10, 0x58(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x60(%rsp) + sbbq %rdx, %r12 + movq %r12, 0x68(%rsp) + movq $0xe000000000000000, %r8 + addq 0x78(%rsp), %r8 + movq $0xffffffffffffffff, %r9 + adcq 0x80(%rsp), %r9 + movq $0x1fffffff, %r10 + adcq 0x88(%rsp), %r10 + movq $0x2000000000000000, %r11 + adcq 0x90(%rsp), %r11 + movq $0x1fffffffe0000000, %r12 + adcq 0x98(%rsp), %r12 + movq %r8, %rbx + shlq $0x20, %rbx + movq $0xffffffff00000001, %rax + mulq %r8 + shrq $0x20, %r8 + addq %rbx, %r9 + adcq %r8, %r10 + adcq %rax, %r11 + adcq %rdx, %r12 + sbbq %rax, %rax + movl $0xffffffff, %ebx + andq %rax, %rbx + movq $0xffffffff00000001, %rdx + andq %rax, %rdx + subq %rax, %r9 + movq %r9, 0x78(%rsp) + sbbq %rbx, %r10 + movq %r10, 0x80(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x88(%rsp) + sbbq %rdx, %r12 + movq %r12, 0x90(%rsp) +p256_scalarmul_inv_midloop: + movq 0xb8(%rsp), %rsi + movq (%rsp), %rdx + movq 0x28(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, 
%rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xc0(%rsp) + movq %rbx, 0xc8(%rsp) + movq %rdi, 0xd0(%rsp) + movq %rcx, 0xd8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x28(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movq $0xc000000000000000, %rax + orq %rax, %rcx + movq 
$0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq 
%r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, %rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + imulq %r11, %r13 + imulq %r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movq $0xc000000000000000, %rax + orq %rax, %rcx + movq 0xc0(%rsp), %rax + imulq %r8, %rax + movq 0xd0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xc8(%rsp), %r8 + imulq 0xd8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xc0(%rsp), %rax + imulq %r10, %rax + movq 0xd0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xc8(%rsp), %r10 + imulq 0xd8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, 
%r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq 
$0x2b, %r8 + sarq $0x2b, %r12 + movq $0x20000100000, %rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, %rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0xb8(%rsp) + decq 0xb0(%rsp) + jne p256_scalarmul_inv_loop + movq (%rsp), %rax + movq 0x28(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x78(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x58(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x80(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x88(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x68(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x90(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r12, 0x50(%rsp) + movq %r13, 0x58(%rsp) + movq %r14, 0x60(%rsp) + movq %r15, 0x68(%rsp) + movq %r9, 0x70(%rsp) + movq $0xe000000000000000, %r8 + addq 0x50(%rsp), %r8 + movq $0xffffffffffffffff, %r9 + adcq 0x58(%rsp), %r9 + movq $0x1fffffff, %r10 + adcq 0x60(%rsp), %r10 + movq $0x2000000000000000, %r11 + adcq 0x68(%rsp), %r11 + movq $0x1fffffffe0000000, %r12 + adcq 0x70(%rsp), %r12 + movq %r8, %rbx + shlq $0x20, %rbx + movq $0xffffffff00000001, %rax + mulq %r8 + shrq $0x20, %r8 + addq %rbx, %r9 + adcq %r8, %r10 + adcq %rax, %r11 + adcq %rdx, %r12 + sbbq %rax, %rax + movl $0xffffffff, %ebx + andq %rax, %rbx + movq $0xffffffff00000001, %rdx + andq %rax, %rdx + subq %rax, %r9 + movq %r9, 0x50(%rsp) + sbbq %rbx, %r10 + movq %r10, 0x58(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x60(%rsp) + sbbq %rdx, %r12 + movq %r12, 0x68(%rsp) + movq 0x50(%rsp), %r8 + movq 0x58(%rsp), %r9 + movq 0x60(%rsp), %r10 + movq 0x68(%rsp), %r11 + movl $0x1, %eax + movl $0xffffffff, %ebx + leaq -0x2(%rax), %rcx + leaq -0x1(%rbx), %rdx + notq %rbx + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + cmovaeq %r8, %rax + cmovaeq %r9, %rbx + cmovaeq %r10, %rcx + cmovaeq %r11, %rdx + movq 0xe0(%rsp), %rdi + movq %rax, (%rdi) + movq %rbx, 0x8(%rdi) + movq %rcx, 0x10(%rdi) + movq %rdx, 0x18(%rdi) + addq $0xf0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +p256_scalarmul_local_montmul_p256: + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rdx, %rcx + xorl %r13d, %r13d + movq (%rcx), %rdx + mulxq (%rsi), %r8, %r9 + mulxq 0x8(%rsi), %rbx, %r10 + 
adcq %rbx, %r9 + mulxq 0x10(%rsi), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x18(%rsi), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x8(%rcx), %rdx + xorl %r14d, %r14d + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x10(%rcx), %rdx + xorl %r8d, %r8d + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x18(%rsi), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x18(%rcx), %rdx + xorl %r9d, %r9d + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x18(%rsi), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + ret + +p256_scalarmul_local_montsqr_p256: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq (%rsi), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x8(%rsi), %r9, %r10 + mulxq 0x18(%rsi), %r11, %r12 + movq 0x10(%rsi), %rdx + mulxq 0x18(%rsi), %r13, %r14 + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x18(%rsi), %rdx + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + xorl %ebp, %ebp + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x8(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x10(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x18(%rsi), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rbp, %r15 + adoxq %rbp, %r15 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + 
adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + movl %ebp, %r9d + adoxq %rbp, %r9 + adcxq %rbp, %r9 + addq %r9, %r14 + adcq %rbp, %r15 + movl %ebp, %r8d + adcq %rbp, %r8 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rbp, %r15 + adoxq %rbp, %r8 + adcq %rbp, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rdx), %rdx + adcq %r13, %rdx + leaq -0x1(%rbp), %rbp + movq %rbp, %rax + adcq %r14, %rbp + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbp, %r14 + cmovbq %r11, %r15 + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +p256_scalarmul_local_tomont_p256: + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + xorq %r13, %r13 + movl $0x3, %edx + mulxq (%rsi), %r8, %r9 + mulxq 0x8(%rsi), %rcx, %r10 + adcxq %rcx, %r9 + mulxq 0x10(%rsi), %rcx, %r11 + adcxq %rcx, %r10 + mulxq 0x18(%rsi), %rcx, %r12 + adcxq %rcx, %r11 + adcxq %r13, %r12 + movq $0xfffffffbffffffff, %rdx + xorq %r14, %r14 + mulxq (%rsi), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq 0x8(%rsi), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq 0x10(%rsi), %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r12 + mulxq 0x18(%rsi), %rax, %rcx + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcq %r14, %r13 + xorq %r15, %r15 + movq $0x100000000, %rdx + mulxq %r8, %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq %r9, %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r12 + mulxq %r9, %rax, %rcx + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcxq %r15, %r14 + movq $0xfffffffffffffffe, %rdx + xorq %r8, %r8 + mulxq (%rsi), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq 0x8(%rsi), %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r12 + mulxq 0x10(%rsi), %rax, %rcx + adcxq %rax, %r12 + adoxq %rcx, %r13 + mulxq 0x18(%rsi), %rax, %rcx + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %r8, %r14 + adoxq %r8, %r15 + adcxq %r8, %r15 + movq $0x4fffffffd, %rdx + xorq %r9, %r9 + mulxq (%rsi), %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r12 + mulxq 0x8(%rsi), %rax, %rcx + adcxq %rax, %r12 + adoxq %rcx, %r13 + mulxq 0x10(%rsi), %rax, %rcx + adcxq %rax, %r13 + adoxq %rcx, %r14 + mulxq 0x18(%rsi), %rax, %rcx + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcxq %r9, %r8 + xorq %r9, %r9 + movq $0x100000000, %rdx + mulxq %r10, %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r12 + mulxq %r11, %rax, %rcx + adcxq %rax, %r12 + adoxq %rcx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rcx + adcxq %rax, %r13 + adoxq %rcx, %r14 + mulxq %r11, %rax, %rcx + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcxq %r9, %r8 + movl $0xffffffff, %edx + movq $0xffffffff00000001, %rcx + movq $0xfffffffffffffffe, %rax + subq %r12, %rax + movq %rdx, %rax + sbbq %r13, %rax + movl $0x0, %eax + sbbq %r14, %rax + movq %rcx, %rax + sbbq %r15, %rax + movl $0x0, %eax + sbbq %r8, %rax + andq %rax, %rdx + andq %rax, %rcx + subq %rax, %r12 + sbbq %rdx, %r13 + 
sbbq $0x0, %r14 + sbbq %rcx, %r15 + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + ret + +p256_scalarmul_local_p256_montjadd: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xe0, %rsp + movq %rdx, %rbp + movq 0x40(%rsi), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x48(%rsi), %r9, %r10 + mulxq 0x58(%rsi), %r11, %r12 + movq 0x50(%rsi), %rdx + mulxq 0x58(%rsi), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x58(%rsi), %rdx + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x48(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x50(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x58(%rsi), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + movl %ecx, %r9d + adoxq %rcx, %r9 + adcxq %rcx, %r9 + addq %r9, %r14 + adcq %rcx, %r15 + movl %ecx, %r8d + adcq %rcx, %r8 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rcx, %r15 + adoxq %rcx, %r8 + adcq %rcx, %r8 + movl $0x1, %r8d + leaq -0x1(%rdx), %rdx + leaq -0x1(%rcx), %rax + movl $0xfffffffe, %r11d + cmoveq %rcx, %r8 + cmoveq %rcx, %rdx + cmoveq %rcx, %rax + cmoveq %rcx, %r11 + addq %r8, %r12 + adcq %rdx, %r13 + adcq %rax, %r14 + adcq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x40(%rbp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x48(%rbp), %r9, %r10 + mulxq 0x58(%rbp), %r11, %r12 + movq 0x50(%rbp), %rdx + mulxq 0x58(%rbp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x40(%rbp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rbp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x58(%rbp), %rdx + mulxq 0x48(%rbp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x48(%rbp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x50(%rbp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x58(%rbp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + 
adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + movl %ecx, %r9d + adoxq %rcx, %r9 + adcxq %rcx, %r9 + addq %r9, %r14 + adcq %rcx, %r15 + movl %ecx, %r8d + adcq %rcx, %r8 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rcx, %r15 + adoxq %rcx, %r8 + adcq %rcx, %r8 + movl $0x1, %r8d + leaq -0x1(%rdx), %rdx + leaq -0x1(%rcx), %rax + movl $0xfffffffe, %r11d + cmoveq %rcx, %r8 + cmoveq %rcx, %rdx + cmoveq %rcx, %rax + cmoveq %rcx, %r11 + addq %r8, %r12 + adcq %rdx, %r13 + adcq %rax, %r14 + adcq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + xorl %r13d, %r13d + movq 0x20(%rsi), %rdx + mulxq 0x40(%rbp), %r8, %r9 + mulxq 0x48(%rbp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x50(%rbp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x58(%rbp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x28(%rsi), %rdx + xorl %r14d, %r14d + mulxq 0x40(%rbp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x48(%rbp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x50(%rbp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x58(%rbp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x30(%rsi), %rdx + xorl %r8d, %r8d + mulxq 0x40(%rbp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rbp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x50(%rbp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x58(%rbp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x38(%rsi), %rdx + xorl %r9d, %r9d + mulxq 0x40(%rbp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x48(%rbp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x50(%rbp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x58(%rbp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xc0(%rsp) + movq %r13, 0xc8(%rsp) + movq %r14, 0xd0(%rsp) + movq %r15, 0xd8(%rsp) + xorl %r13d, %r13d + movq 0x20(%rbp), %rdx + mulxq 0x40(%rsi), %r8, %r9 + mulxq 0x48(%rsi), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x50(%rsi), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x58(%rsi), %rbx, %r12 + adcq %rbx, %r11 + 
adcq %r13, %r12 + movq 0x28(%rbp), %rdx + xorl %r14d, %r14d + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x58(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x30(%rbp), %rdx + xorl %r8d, %r8d + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x58(%rsi), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x38(%rbp), %rdx + xorl %r9d, %r9d + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x58(%rsi), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + xorl %r13d, %r13d + movq 0x0(%rbp), %rdx + mulxq (%rsp), %r8, %r9 + mulxq 0x8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x10(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x18(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x8(%rbp), %rdx + xorl %r14d, %r14d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x10(%rbp), %rdx + xorl %r8d, %r8d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x18(%rbp), %rdx + xorl %r9d, %r9d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq 
%rbx, %r12 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + xorl %r13d, %r13d + movq (%rsi), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0xb0(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0xb8(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x8(%rsi), %rdx + xorl %r14d, %r14d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x10(%rsi), %rdx + xorl %r8d, %r8d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x18(%rsi), %rdx + xorl %r9d, %r9d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + xorl %r13d, %r13d + movq 0x20(%rsp), %rdx + mulxq (%rsp), %r8, %r9 + mulxq 0x8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x10(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x18(%rsp), %rbx, 
%r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x28(%rsp), %rdx + xorl %r14d, %r14d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x30(%rsp), %rdx + xorl %r8d, %r8d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x38(%rsp), %rdx + xorl %r9d, %r9d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + xorl %r13d, %r13d + movq 0xc0(%rsp), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0xb0(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0xb8(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0xc8(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0xd0(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0xd8(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0xa0(%rsp), %rax, 
%rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xc0(%rsp) + movq %r13, 0xc8(%rsp) + movq %r14, 0xd0(%rsp) + movq %r15, 0xd8(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0xa0(%rsp) + adcq %r10, %rcx + movq %rcx, 0xa8(%rsp) + adcq $0x0, %r8 + movq %r8, 0xb0(%rsp) + adcq %rdx, %r9 + movq %r9, 0xb8(%rsp) + movq 0x20(%rsp), %rax + subq 0xc0(%rsp), %rax + movq 0x28(%rsp), %rcx + sbbq 0xc8(%rsp), %rcx + movq 0x30(%rsp), %r8 + sbbq 0xd0(%rsp), %r8 + movq 0x38(%rsp), %r9 + sbbq 0xd8(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x20(%rsp) + adcq %r10, %rcx + movq %rcx, 0x28(%rsp) + adcq $0x0, %r8 + movq %r8, 0x30(%rsp) + adcq %rdx, %r9 + movq %r9, 0x38(%rsp) + movq 0xa0(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0xa8(%rsp), %r9, %r10 + mulxq 0xb8(%rsp), %r11, %r12 + movq 0xb0(%rsp), %rdx + mulxq 0xb8(%rsp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0xb8(%rsp), %rdx + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0xa8(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0xb0(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0xb8(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + movl %ecx, %r9d + adoxq %rcx, %r9 + adcxq %rcx, %r9 + addq %r9, %r14 + adcq %rcx, %r15 + movl %ecx, %r8d + adcq %rcx, %r8 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, 
%rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rcx, %r15 + adoxq %rcx, %r8 + adcq %rcx, %r8 + movl $0x1, %r8d + leaq -0x1(%rdx), %rdx + leaq -0x1(%rcx), %rax + movl $0xfffffffe, %r11d + cmoveq %rcx, %r8 + cmoveq %rcx, %rdx + cmoveq %rcx, %rax + cmoveq %rcx, %r11 + addq %r8, %r12 + adcq %rdx, %r13 + adcq %rax, %r14 + adcq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x20(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x28(%rsp), %r9, %r10 + mulxq 0x38(%rsp), %r11, %r12 + movq 0x30(%rsp), %rdx + mulxq 0x38(%rsp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x38(%rsp), %rdx + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x28(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x30(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x38(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + movl %ecx, %r9d + adoxq %rcx, %r9 + adcxq %rcx, %r9 + addq %r9, %r14 + adcq %rcx, %r15 + movl %ecx, %r8d + adcq %rcx, %r8 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rcx, %r15 + adoxq %rcx, %r8 + adcq %rcx, %r8 + movl $0x1, %ebx + addq %r12, %rbx + leaq -0x1(%rdx), %rdx + adcq %r13, %rdx + leaq -0x1(%rcx), %rcx + movq %rcx, %rax + adcq %r14, %rcx + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rbx, %r12 + cmovbq %rdx, %r13 + cmovbq %rcx, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + xorl %r13d, %r13d + movq 0x80(%rsp), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x70(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x78(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x88(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x90(%rsp), %rdx + xorl %r8d, 
%r8d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x98(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + xorl %r13d, %r13d + movq 0x40(%rsp), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x70(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x78(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x48(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x50(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x58(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl 
$0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq (%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq $0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq $0x0, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + xorl %r13d, %r13d + movq 0x40(%rsi), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0xb0(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0xb8(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x48(%rsi), %rdx + xorl %r14d, %r14d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x50(%rsi), %rdx + xorl %r8d, %r8d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x58(%rsi), %rdx + xorl %r9d, %r9d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq (%rsp), %rax + subq 0x40(%rsp), %rax + 
movq 0x8(%rsp), %rcx + sbbq 0x48(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x50(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x58(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq $0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x80(%rsp), %rax + subq (%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + xorl %r13d, %r13d + movq 0xc0(%rsp), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x70(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x78(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0xc8(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0xd0(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0xd8(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + xorl %r13d, %r13d + movq 0x40(%rbp), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0xb0(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0xb8(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x48(%rbp), %rdx + xorl %r14d, %r14d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, 
%r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x50(%rbp), %rdx + xorl %r8d, %r8d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x58(%rbp), %rdx + xorl %r9d, %r9d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + xorl %r13d, %r13d + movq 0x80(%rsp), %rdx + mulxq 0x20(%rsp), %r8, %r9 + mulxq 0x28(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x30(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x38(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x88(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x38(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x90(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x38(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x98(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + 
mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x38(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x80(%rsp), %rax + subq 0x60(%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x68(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x70(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x78(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0x40(%rsi), %r8 + movq 0x48(%rsi), %r9 + movq 0x50(%rsi), %r10 + movq 0x58(%rsi), %r11 + movq %r8, %rax + movq %r9, %rdx + orq %r10, %rax + orq %r11, %rdx + orq %rdx, %rax + negq %rax + sbbq %rax, %rax + movq 0x40(%rbp), %r12 + movq 0x48(%rbp), %r13 + movq 0x50(%rbp), %r14 + movq 0x58(%rbp), %r15 + movq %r12, %rbx + movq %r13, %rdx + orq %r14, %rbx + orq %r15, %rdx + orq %rdx, %rbx + negq %rbx + sbbq %rbx, %rbx + cmpq %rax, %rbx + cmovbq %r8, %r12 + cmovbq %r9, %r13 + cmovbq %r10, %r14 + cmovbq %r11, %r15 + cmoveq 0xa0(%rsp), %r12 + cmoveq 0xa8(%rsp), %r13 + cmoveq 0xb0(%rsp), %r14 + cmoveq 0xb8(%rsp), %r15 + movq (%rsp), %rax + cmovbq (%rsi), %rax + cmova 0x0(%rbp), %rax + movq 0x8(%rsp), %rbx + cmovbq 0x8(%rsi), %rbx + cmova 0x8(%rbp), %rbx + movq 0x10(%rsp), %rcx + cmovbq 0x10(%rsi), %rcx + cmova 0x10(%rbp), %rcx + movq 0x18(%rsp), %rdx + cmovbq 0x18(%rsi), %rdx + cmova 0x18(%rbp), %rdx + movq 0x80(%rsp), %r8 + cmovbq 0x20(%rsi), %r8 + cmova 0x20(%rbp), %r8 + movq 0x88(%rsp), %r9 + cmovbq 0x28(%rsi), %r9 + cmova 0x28(%rbp), %r9 + movq 0x90(%rsp), %r10 + cmovbq 0x30(%rsi), %r10 + cmova 0x30(%rbp), %r10 + movq 0x98(%rsp), %r11 + cmovbq 0x38(%rsi), %r11 + cmova 0x38(%rbp), %r11 + movq %rax, (%rdi) + movq %rbx, 0x8(%rdi) + movq %rcx, 0x10(%rdi) + movq %rdx, 0x18(%rdi) + movq %r8, 0x20(%rdi) + movq %r9, 0x28(%rdi) + movq %r10, 0x30(%rdi) + movq %r11, 0x38(%rdi) + movq %r12, 0x40(%rdi) + movq %r13, 0x48(%rdi) + movq %r14, 0x50(%rdi) + movq %r15, 0x58(%rdi) + addq $0xe0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +p256_scalarmul_local_p256_montjdouble: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xc0, %rsp + movq 0x40(%rsi), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x48(%rsi), %r9, %r10 + mulxq 0x58(%rsi), %r11, %r12 + movq 0x50(%rsi), %rdx + mulxq 0x58(%rsi), %r13, %r14 + xorl %ebp, %ebp + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x58(%rsi), %rdx + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + xorl %ebp, %ebp 
+ adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x48(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x50(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x58(%rsi), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rbp, %r15 + adoxq %rbp, %r15 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + movl %ebp, %r9d + adoxq %rbp, %r9 + adcxq %rbp, %r9 + addq %r9, %r14 + adcq %rbp, %r15 + movl %ebp, %r8d + adcq %rbp, %r8 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rbp, %r15 + adoxq %rbp, %r8 + adcq %rbp, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rdx), %rdx + adcq %r13, %rdx + leaq -0x1(%rbp), %rbp + movq %rbp, %rax + adcq %r14, %rbp + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbp, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x20(%rsi), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x28(%rsi), %r9, %r10 + mulxq 0x38(%rsi), %r11, %r12 + movq 0x30(%rsi), %rdx + mulxq 0x38(%rsi), %r13, %r14 + xorl %ebp, %ebp + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x38(%rsi), %rdx + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + xorl %ebp, %ebp + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x28(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x30(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x38(%rsi), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rbp, %r15 + adoxq %rbp, %r15 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + movl %ebp, %r9d + adoxq %rbp, %r9 + adcxq %rbp, %r9 + addq %r9, %r14 + adcq %rbp, %r15 + movl %ebp, %r8d + adcq %rbp, %r8 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rbp, %r15 + adoxq %rbp, %r8 + adcq %rbp, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rdx), %rdx + adcq %r13, %rdx + leaq -0x1(%rbp), %rbp + movq %rbp, %rax + adcq %r14, %rbp + movl $0xfffffffe, %r11d + adcq %r15, 
%r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbp, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq (%rsi), %rax + subq (%rsp), %rax + movq 0x8(%rsi), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x10(%rsi), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x18(%rsi), %r9 + sbbq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq $0x0, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + movq (%rsi), %rax + addq (%rsp), %rax + movq 0x8(%rsi), %rcx + adcq 0x8(%rsp), %rcx + movq 0x10(%rsi), %r8 + adcq 0x10(%rsp), %r8 + movq 0x18(%rsi), %r9 + adcq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + subq %r11, %rax + movq %rax, 0x40(%rsp) + sbbq %r10, %rcx + movq %rcx, 0x48(%rsp) + sbbq $0x0, %r8 + movq %r8, 0x50(%rsp) + sbbq %rdx, %r9 + movq %r9, 0x58(%rsp) + xorl %r13d, %r13d + movq 0x60(%rsp), %rdx + mulxq 0x40(%rsp), %r8, %r9 + mulxq 0x48(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x50(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x58(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x68(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0x40(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x48(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x50(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x58(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x70(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0x40(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x50(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x58(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x78(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0x40(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x48(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x50(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x58(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + xorq %r11, %r11 + movq 0x20(%rsi), %rax + addq 0x40(%rsi), %rax + movq 0x28(%rsi), 
%rcx + adcq 0x48(%rsi), %rcx + movq 0x30(%rsi), %r8 + adcq 0x50(%rsi), %r8 + movq 0x38(%rsi), %r9 + adcq 0x58(%rsi), %r9 + adcq %r11, %r11 + subq $0xffffffffffffffff, %rax + movl $0xffffffff, %r10d + sbbq %r10, %rcx + sbbq $0x0, %r8 + movq $0xffffffff00000001, %rdx + sbbq %rdx, %r9 + sbbq $0x0, %r11 + andq %r11, %r10 + andq %r11, %rdx + addq %r11, %rax + movq %rax, 0x40(%rsp) + adcq %r10, %rcx + movq %rcx, 0x48(%rsp) + adcq $0x0, %r8 + movq %r8, 0x50(%rsp) + adcq %rdx, %r9 + movq %r9, 0x58(%rsp) + xorl %r13d, %r13d + movq 0x20(%rsp), %rdx + mulxq (%rsi), %r8, %r9 + mulxq 0x8(%rsi), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x10(%rsi), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x18(%rsi), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x28(%rsp), %rdx + xorl %r14d, %r14d + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x30(%rsp), %rdx + xorl %r8d, %r8d + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x18(%rsi), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x38(%rsp), %rdx + xorl %r9d, %r9d + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x18(%rsi), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x60(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x68(%rsp), %r9, %r10 + mulxq 0x78(%rsp), %r11, %r12 + movq 0x70(%rsp), %rdx + mulxq 0x78(%rsp), %r13, %r14 + xorl %ebp, %ebp + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x78(%rsp), %rdx + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + xorl %ebp, %ebp + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x68(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x70(%rsp), %rdx + mulxq %rdx, 
%rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x78(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rbp, %r15 + adoxq %rbp, %r15 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + movl %ebp, %r9d + adoxq %rbp, %r9 + adcxq %rbp, %r9 + addq %r9, %r14 + adcq %rbp, %r15 + movl %ebp, %r8d + adcq %rbp, %r8 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rbp, %r15 + adoxq %rbp, %r8 + adcq %rbp, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rdx), %rdx + adcq %r13, %rdx + leaq -0x1(%rbp), %rbp + movq %rbp, %rax + adcq %r14, %rbp + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbp, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq 0x40(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x48(%rsp), %r9, %r10 + mulxq 0x58(%rsp), %r11, %r12 + movq 0x50(%rsp), %rdx + mulxq 0x58(%rsp), %r13, %r14 + xorl %ebp, %ebp + mulxq 0x40(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x58(%rsp), %rdx + mulxq 0x48(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + xorl %ebp, %ebp + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x48(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x50(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x58(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rbp, %r15 + adoxq %rbp, %r15 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + movl %ebp, %r9d + adoxq %rbp, %r9 + adcxq %rbp, %r9 + addq %r9, %r14 + adcq %rbp, %r15 + movl %ebp, %r8d + adcq %rbp, %r8 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rbp, %r15 + adoxq %rbp, %r8 + adcq %rbp, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rdx), %rdx + adcq %r13, %rdx + leaq -0x1(%rbp), %rbp + movq %rbp, %rax + adcq %r14, %rbp + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbp, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 
0x58(%rsp) + movq $0xffffffffffffffff, %r8 + xorl %r10d, %r10d + subq 0xa0(%rsp), %r8 + movq $0xffffffff, %r9 + sbbq 0xa8(%rsp), %r9 + sbbq 0xb0(%rsp), %r10 + movq $0xffffffff00000001, %r11 + sbbq 0xb8(%rsp), %r11 + xorl %r12d, %r12d + movq $0x9, %rdx + mulxq %r8, %r8, %rax + mulxq %r9, %r9, %rcx + addq %rax, %r9 + mulxq %r10, %r10, %rax + adcq %rcx, %r10 + mulxq %r11, %r11, %rcx + adcq %rax, %r11 + adcq %rcx, %r12 + movq $0xc, %rdx + xorl %eax, %eax + mulxq 0x80(%rsp), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + mulxq 0x88(%rsp), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq 0x90(%rsp), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq 0x98(%rsp), %rax, %rdx + adcxq %rax, %r11 + adoxq %r12, %rdx + adcq $0x1, %rdx + addq %rdx, %r8 + movq $0x100000000, %rax + mulxq %rax, %rax, %rcx + sbbq $0x0, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + movq $0xffffffff00000001, %rax + mulxq %rax, %rax, %rcx + sbbq %rax, %r11 + sbbq %rcx, %rdx + decq %rdx + movl $0xffffffff, %eax + andq %rdx, %rax + xorl %ecx, %ecx + subq %rax, %rcx + addq %rdx, %r8 + movq %r8, 0xa0(%rsp) + adcq %rax, %r9 + movq %r9, 0xa8(%rsp) + adcq $0x0, %r10 + movq %r10, 0xb0(%rsp) + adcq %rcx, %r11 + movq %r11, 0xb8(%rsp) + movq 0x40(%rsp), %rax + subq (%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x40(%rsp) + adcq %r10, %rcx + movq %rcx, 0x48(%rsp) + adcq $0x0, %r8 + movq %r8, 0x50(%rsp) + adcq %rdx, %r9 + movq %r9, 0x58(%rsp) + movq 0x20(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x28(%rsp), %r9, %r10 + mulxq 0x38(%rsp), %r11, %r12 + movq 0x30(%rsp), %rdx + mulxq 0x38(%rsp), %r13, %r14 + xorl %ebp, %ebp + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x38(%rsp), %rdx + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + xorl %ebp, %ebp + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x28(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x30(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x38(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rbp, %r15 + adoxq %rbp, %r15 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + movl %ebp, %r9d + adoxq %rbp, %r9 + adcxq %rbp, %r9 + addq %r9, %r14 + adcq %rbp, %r15 + movl %ebp, %r8d + adcq %rbp, %r8 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rbp, %r15 + adoxq %rbp, %r8 + adcq %rbp, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rdx), %rdx + adcq %r13, %rdx + leaq -0x1(%rbp), %rbp + movq %rbp, %rax + adcq 
%r14, %rbp + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbp, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + xorl %r13d, %r13d + movq 0x60(%rsp), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0xb0(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0xb8(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x68(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x70(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x78(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x40(%rsp), %rax + subq 0x20(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x28(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x30(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x38(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x40(%rdi) + adcq %r10, %rcx + movq %rcx, 0x48(%rdi) + adcq $0x0, %r8 + movq %r8, 0x50(%rdi) + adcq %rdx, %r9 + movq %r9, 0x58(%rdi) + movq 0x98(%rsp), %r11 + movq %r11, %rdx + movq 0x90(%rsp), %r10 + shldq $0x2, %r10, %r11 + movq 0x88(%rsp), %r9 + shldq $0x2, %r9, %r10 + movq 0x80(%rsp), %r8 + shldq $0x2, %r8, %r9 + shlq $0x2, %r8 + shrq $0x3e, %rdx + addq $0x1, %rdx + subq 0xa0(%rsp), %r8 + sbbq 0xa8(%rsp), %r9 + sbbq 0xb0(%rsp), %r10 + sbbq 0xb8(%rsp), %r11 + sbbq $0x0, %rdx + addq %rdx, %r8 + movq $0x100000000, %rax + mulxq %rax, %rax, %rcx + sbbq $0x0, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + 
sbbq %rcx, %r10 + movq $0xffffffff00000001, %rax + mulxq %rax, %rax, %rcx + sbbq %rax, %r11 + sbbq %rcx, %rdx + decq %rdx + movl $0xffffffff, %eax + andq %rdx, %rax + xorl %ecx, %ecx + subq %rax, %rcx + addq %rdx, %r8 + movq %r8, (%rdi) + adcq %rax, %r9 + movq %r9, 0x8(%rdi) + adcq $0x0, %r10 + movq %r10, 0x10(%rdi) + adcq %rcx, %r11 + movq %r11, 0x18(%rdi) + movq $0xffffffffffffffff, %r8 + xorl %r10d, %r10d + subq (%rsp), %r8 + movq $0xffffffff, %r9 + sbbq 0x8(%rsp), %r9 + sbbq 0x10(%rsp), %r10 + movq $0xffffffff00000001, %r11 + sbbq 0x18(%rsp), %r11 + movq %r11, %r12 + shldq $0x3, %r10, %r11 + shldq $0x3, %r9, %r10 + shldq $0x3, %r8, %r9 + shlq $0x3, %r8 + shrq $0x3d, %r12 + movq $0x3, %rdx + xorl %eax, %eax + mulxq 0x60(%rsp), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + mulxq 0x68(%rsp), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq 0x70(%rsp), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq 0x78(%rsp), %rax, %rdx + adcxq %rax, %r11 + adoxq %r12, %rdx + adcq $0x1, %rdx + addq %rdx, %r8 + movq $0x100000000, %rax + mulxq %rax, %rax, %rcx + sbbq $0x0, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + movq $0xffffffff00000001, %rax + mulxq %rax, %rax, %rcx + sbbq %rax, %r11 + sbbq %rcx, %rdx + decq %rdx + movl $0xffffffff, %eax + andq %rdx, %rax + xorl %ecx, %ecx + subq %rax, %rcx + addq %rdx, %r8 + movq %r8, 0x20(%rdi) + adcq %rax, %r9 + movq %r9, 0x28(%rdi) + adcq $0x0, %r10 + movq %r10, 0x30(%rdi) + adcq %rcx, %r11 + movq %r11, 0x38(%rdi) + addq $0xc0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +p256_scalarmul_local_p256_montjmixadd: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xc0, %rsp + movq %rdx, %rbp + movq 0x40(%rsi), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x48(%rsi), %r9, %r10 + mulxq 0x58(%rsi), %r11, %r12 + movq 0x50(%rsi), %rdx + mulxq 0x58(%rsi), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x58(%rsi), %rdx + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x48(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x50(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x58(%rsi), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + movl %ecx, %r9d + adoxq %rcx, %r9 + adcxq %rcx, %r9 + addq %r9, %r14 + adcq %rcx, %r15 + movl %ecx, %r8d + adcq %rcx, %r8 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rcx, %r15 + adoxq %rcx, %r8 + adcq %rcx, %r8 + movl $0x1, %r8d + leaq -0x1(%rdx), %rdx + leaq 
-0x1(%rcx), %rax + movl $0xfffffffe, %r11d + cmoveq %rcx, %r8 + cmoveq %rcx, %rdx + cmoveq %rcx, %rax + cmoveq %rcx, %r11 + addq %r8, %r12 + adcq %rdx, %r13 + adcq %rax, %r14 + adcq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + xorl %r13d, %r13d + movq 0x20(%rbp), %rdx + mulxq 0x40(%rsi), %r8, %r9 + mulxq 0x48(%rsi), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x50(%rsi), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x58(%rsi), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x28(%rbp), %rdx + xorl %r14d, %r14d + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x58(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x30(%rbp), %rdx + xorl %r8d, %r8d + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x58(%rsi), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x38(%rbp), %rdx + xorl %r9d, %r9d + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x58(%rsi), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + xorl %r13d, %r13d + movq 0x0(%rbp), %rdx + mulxq (%rsp), %r8, %r9 + mulxq 0x8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x10(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x18(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x8(%rbp), %rdx + xorl %r14d, %r14d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq 
%rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x10(%rbp), %rdx + xorl %r8d, %r8d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x18(%rbp), %rdx + xorl %r9d, %r9d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + xorl %r13d, %r13d + movq 0x20(%rsp), %rdx + mulxq (%rsp), %r8, %r9 + mulxq 0x8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x10(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x18(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x28(%rsp), %rdx + xorl %r14d, %r14d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x30(%rsp), %rdx + xorl %r8d, %r8d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x38(%rsp), %rdx + xorl %r9d, %r9d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq 
%rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq 0x40(%rsp), %rax + subq (%rsi), %rax + movq 0x48(%rsp), %rcx + sbbq 0x8(%rsi), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x10(%rsi), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x18(%rsi), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0xa0(%rsp) + adcq %r10, %rcx + movq %rcx, 0xa8(%rsp) + adcq $0x0, %r8 + movq %r8, 0xb0(%rsp) + adcq %rdx, %r9 + movq %r9, 0xb8(%rsp) + movq 0x20(%rsp), %rax + subq 0x20(%rsi), %rax + movq 0x28(%rsp), %rcx + sbbq 0x28(%rsi), %rcx + movq 0x30(%rsp), %r8 + sbbq 0x30(%rsi), %r8 + movq 0x38(%rsp), %r9 + sbbq 0x38(%rsi), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x20(%rsp) + adcq %r10, %rcx + movq %rcx, 0x28(%rsp) + adcq $0x0, %r8 + movq %r8, 0x30(%rsp) + adcq %rdx, %r9 + movq %r9, 0x38(%rsp) + movq 0xa0(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0xa8(%rsp), %r9, %r10 + mulxq 0xb8(%rsp), %r11, %r12 + movq 0xb0(%rsp), %rdx + mulxq 0xb8(%rsp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0xb8(%rsp), %rdx + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0xa8(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0xb0(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0xb8(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + movl %ecx, %r9d + adoxq %rcx, %r9 + adcxq %rcx, %r9 + addq %r9, %r14 + adcq %rcx, %r15 + movl %ecx, %r8d + adcq %rcx, %r8 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rcx, %r15 + adoxq %rcx, %r8 + adcq %rcx, %r8 + movl $0x1, %r8d + leaq -0x1(%rdx), %rdx + leaq -0x1(%rcx), %rax + movl $0xfffffffe, %r11d + cmoveq %rcx, %r8 + cmoveq %rcx, %rdx + cmoveq %rcx, %rax + cmoveq %rcx, %r11 + addq %r8, %r12 + adcq %rdx, %r13 + adcq %rax, %r14 + adcq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x20(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x28(%rsp), %r9, %r10 + mulxq 0x38(%rsp), %r11, %r12 + movq 0x30(%rsp), %rdx + mulxq 0x38(%rsp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsp), 
%rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x38(%rsp), %rdx + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x28(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x30(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x38(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + movl %ecx, %r9d + adoxq %rcx, %r9 + adcxq %rcx, %r9 + addq %r9, %r14 + adcq %rcx, %r15 + movl %ecx, %r8d + adcq %rcx, %r8 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rcx, %r15 + adoxq %rcx, %r8 + adcq %rcx, %r8 + movl $0x1, %ebx + addq %r12, %rbx + leaq -0x1(%rdx), %rdx + adcq %r13, %rdx + leaq -0x1(%rcx), %rcx + movq %rcx, %rax + adcq %r14, %rcx + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rbx, %r12 + cmovbq %rdx, %r13 + cmovbq %rcx, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + xorl %r13d, %r13d + movq (%rsi), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x70(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x78(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x8(%rsi), %rdx + xorl %r14d, %r14d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x10(%rsi), %rdx + xorl %r8d, %r8d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x18(%rsi), %rdx + xorl %r9d, %r9d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq 
%r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + xorl %r13d, %r13d + movq 0x40(%rsp), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x70(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x78(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x48(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x50(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x58(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq (%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq $0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + 
movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq $0x0, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + xorl %r13d, %r13d + movq 0x40(%rsi), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0xb0(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0xb8(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x48(%rsi), %rdx + xorl %r14d, %r14d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x50(%rsi), %rdx + xorl %r8d, %r8d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x58(%rsi), %rdx + xorl %r9d, %r9d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq (%rsp), %rax + subq 0x40(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x48(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x50(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x58(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq $0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x80(%rsp), %rax + subq (%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + 
adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + xorl %r13d, %r13d + movq 0x20(%rsi), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x70(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x78(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x28(%rsi), %rdx + xorl %r14d, %r14d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x30(%rsi), %rdx + xorl %r8d, %r8d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x38(%rsi), %rdx + xorl %r9d, %r9d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + xorl %r13d, %r13d + movq 0x80(%rsp), %rdx + mulxq 0x20(%rsp), %r8, %r9 + mulxq 0x28(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x30(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x38(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x88(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x38(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x90(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, 
%r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x38(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x98(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x38(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x80(%rsp), %rax + subq 0x60(%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x68(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x70(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x78(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0x40(%rsi), %rax + movq 0x48(%rsi), %rdx + orq 0x50(%rsi), %rax + orq 0x58(%rsi), %rdx + orq %rdx, %rax + movq (%rsp), %r8 + movq 0x0(%rbp), %rax + cmoveq %rax, %r8 + movq 0x8(%rsp), %r9 + movq 0x8(%rbp), %rax + cmoveq %rax, %r9 + movq 0x10(%rsp), %r10 + movq 0x10(%rbp), %rax + cmoveq %rax, %r10 + movq 0x18(%rsp), %r11 + movq 0x18(%rbp), %rax + cmoveq %rax, %r11 + movq 0x80(%rsp), %r12 + movq 0x20(%rbp), %rax + cmoveq %rax, %r12 + movq 0x88(%rsp), %r13 + movq 0x28(%rbp), %rax + cmoveq %rax, %r13 + movq 0x90(%rsp), %r14 + movq 0x30(%rbp), %rax + cmoveq %rax, %r14 + movq 0x98(%rsp), %r15 + movq 0x38(%rbp), %rax + cmoveq %rax, %r15 + movq %r8, (%rdi) + movq %r9, 0x8(%rdi) + movq %r10, 0x10(%rdi) + movq %r11, 0x18(%rdi) + movq %r12, 0x20(%rdi) + movq %r13, 0x28(%rdi) + movq %r14, 0x30(%rdi) + movq %r15, 0x38(%rdi) + movq 0xa0(%rsp), %r8 + movq 0xa8(%rsp), %r9 + movq 0xb0(%rsp), %r10 + movq 0xb8(%rsp), %r11 + movl $0x1, %eax + cmoveq %rax, %r8 + movq $0xffffffff00000000, %rax + cmoveq %rax, %r9 + movq $0xffffffffffffffff, %rax + cmoveq %rax, %r10 + movl $0xfffffffe, %eax + cmoveq %rax, %r11 + movq %r8, 0x40(%rdi) + movq %r9, 0x48(%rdi) + movq %r10, 0x50(%rdi) + movq %r11, 0x58(%rdi) + addq $0xc0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_scalarmul_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_scalarmul_alt.S new file mode 100644 index 00000000000..f1ec19f4491 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_scalarmul_alt.S @@ -0,0 +1,8651 @@ +// Copyright Amazon.com, 
Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Scalar multiplication for P-256 +// Input scalar[4], point[8]; output res[8] +// +// extern void p256_scalarmul_alt +// (uint64_t res[static 8],uint64_t scalar[static 4], +// uint64_t point[static 8]); +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve P-256, returns the point (X,Y) = n * P. The input and output +// are affine points, and in the case of the point at infinity as +// the result, (0,0) is returned. +// +// Standard x86-64 ABI: RDI = res, RSI = scalar, RDX = point +// Microsoft x64 ABI: RCX = res, RDX = scalar, R8 = point +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_scalarmul_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_scalarmul_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Intermediate variables on the stack. The last z2, z3 values can +// safely be overlaid on the table, which is no longer needed at the end. +// Uppercase syntactic variants make x86_att version simpler to generate + +#define SCALARB (0*NUMSIZE) +#define scalarb (0*NUMSIZE)(%rsp) +#define ACC (1*NUMSIZE) +#define acc (1*NUMSIZE)(%rsp) +#define TABENT (4*NUMSIZE) +#define tabent (4*NUMSIZE)(%rsp) + +#define TAB (7*NUMSIZE) +#define tab (7*NUMSIZE)(%rsp) + +#define Z2 (7*NUMSIZE) +#define z2 (7*NUMSIZE)(%rsp) +#define Z3 (8*NUMSIZE) +#define z3 (8*NUMSIZE)(%rsp) + +#define res (31*NUMSIZE)(%rsp) + +#define NSPACE (32*NUMSIZE) + +S2N_BN_SYMBOL(p256_scalarmul_alt): + _CET_ENDBR + +// The Windows version literally calls the standard ABI version. +// This simplifies the proofs since subroutine offsets are fixed. + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + callq p256_scalarmul_alt_standard + popq %rsi + popq %rdi + ret + +p256_scalarmul_alt_standard: +#endif + +// Real start of the standard ABI code. + + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + + subq $NSPACE, %rsp + +// Preserve the "res" and "point" input arguments. We load and process the +// scalar immediately so we don't bother preserving that input argument. +// Also, "point" is only needed early on and so its register gets re-used. + + movq %rdx, %rbx + movq %rdi, res + +// Load the digits of group order n_256 = [%r15;%r14;%r13;%r12] + + movq $0xf3b9cac2fc632551, %r12 + movq $0xbce6faada7179e84, %r13 + movq $0xffffffffffffffff, %r14 + movq $0xffffffff00000000, %r15 + +// First, reduce the input scalar mod n_256, i.e. conditionally subtract n_256 + + movq (%rsi), %r8 + subq %r12, %r8 + movq 8(%rsi), %r9 + sbbq %r13, %r9 + movq 16(%rsi), %r10 + sbbq %r14, %r10 + movq 24(%rsi), %r11 + sbbq %r15, %r11 + + cmovcq (%rsi), %r8 + cmovcq 8(%rsi), %r9 + cmovcq 16(%rsi), %r10 + cmovcq 24(%rsi), %r11 + +// Now if the top bit of the reduced scalar is set, negate it mod n_256, +// i.e. do n |-> n_256 - n. Remember the sign in %rbp so we can +// correspondingly negate the point below. + + subq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + sbbq %r11, %r15 + + movq %r11, %rbp + shrq $63, %rbp + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + +// In either case then add the recoding constant 0x08888...888 to allow +// signed digits. 
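For orientation, the scalar preparation described in the comments above (reduce the scalar modulo the group order n_256, negate it modulo n_256 when the top bit of the reduced value is set while remembering the sign, then add the recoding constant so each 4-bit window can be read as a signed digit) corresponds to roughly the following C sketch. The names sub4 and p256_recode_scalar and the value-dependent ?: selects are illustrative only, not part of this patch; the assembly that follows, which uses sbb/cmov throughout, is the authoritative constant-time implementation.

#include <stdint.h>

/* Group order n_256, least-significant limb first (the same digits the code
   above loads into %r12..%r15). */
static const uint64_t N256[4] = {
    0xf3b9cac2fc632551ULL, 0xbce6faada7179e84ULL,
    0xffffffffffffffffULL, 0xffffffff00000000ULL};

/* r = a - b over 4 limbs; returns the final borrow (1 iff a < b). */
static uint64_t sub4(uint64_t r[4], const uint64_t a[4], const uint64_t b[4]) {
  uint64_t borrow = 0;
  for (int i = 0; i < 4; i++) {
    uint64_t d = a[i] - b[i];
    uint64_t b1 = a[i] < b[i];
    r[i] = d - borrow;
    borrow = b1 | (d < borrow);
  }
  return borrow;
}

/* Returns the sign flag the assembly keeps in %rbp: 1 if the scalar was
   replaced by n_256 - scalar (so the point's y coordinate is negated too). */
static uint64_t p256_recode_scalar(uint64_t out[4], const uint64_t scalar[4]) {
  uint64_t red[4], neg[4];

  /* Conditionally subtract n_256; keep the original if the subtraction borrows. */
  uint64_t lt = sub4(red, scalar, N256);
  for (int i = 0; i < 4; i++) red[i] = lt ? scalar[i] : red[i];

  /* If the top bit is set, use n_256 - scalar instead and remember the sign. */
  uint64_t sign = red[3] >> 63;
  sub4(neg, N256, red);
  for (int i = 0; i < 4; i++) out[i] = sign ? neg[i] : red[i];

  /* Add 0x8888...8888 across all four limbs with carry, then flip the top
     bit (the btc $63 below), so the main loop can treat every 4-bit window
     minus 8 as a signed digit in -8..7. */
  uint64_t carry = 0;
  for (int i = 0; i < 4; i++) {
    uint64_t s = out[i] + 0x8888888888888888ULL;
    uint64_t c1 = s < out[i];
    out[i] = s + carry;
    carry = c1 | (out[i] < s);
  }
  out[3] ^= 0x8000000000000000ULL;
  return sign;
}

Recoding into signed digits is what lets the precomputed table below hold only the eight multiples 1*P, ..., 8*P.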
+ + movq $0x8888888888888888, %rax + addq %rax, %r8 + adcq %rax, %r9 + adcq %rax, %r10 + adcq %rax, %r11 + btc $63, %r11 + + movq %r8, SCALARB(%rsp) + movq %r9, SCALARB+8(%rsp) + movq %r10, SCALARB+16(%rsp) + movq %r11, SCALARB+24(%rsp) + +// Set the tab[0] table entry to Montgomery-Jacobian point = 1 * P +// The z coordinate is just the Montgomery form of the constant 1. + + leaq TAB(%rsp), %rdi + movq %rbx, %rsi + callq p256_scalarmul_alt_local_tomont_p256 + + leaq 32(%rbx), %rsi + leaq TAB+32(%rsp), %rdi + callq p256_scalarmul_alt_local_tomont_p256 + + movl $1, %eax + movq %rax, TAB+64(%rsp) + movq $0xffffffff00000000, %rdx + movq %rdx, TAB+72(%rsp) + subq $2, %rax + movq %rax, TAB+80(%rsp) + movq $0x00000000fffffffe, %rax + movq %rax, TAB+88(%rsp) + +// If the top bit of the scalar was set, negate (y coordinate of) the point + + movq TAB+32(%rsp), %r12 + movq TAB+40(%rsp), %r13 + movq TAB+48(%rsp), %r14 + movq TAB+56(%rsp), %r15 + + xorl %r10d, %r10d + leaq -1(%r10), %r8 + movq $0x00000000ffffffff, %r11 + movq %r11, %r9 + negq %r11 + + subq %r12, %r8 + sbbq %r13, %r9 + sbbq %r14, %r10 + sbbq %r15, %r11 + + testq %rbp, %rbp + cmovzq %r12, %r8 + cmovzq %r13, %r9 + cmovzq %r14, %r10 + cmovzq %r15, %r11 + + movq %r8, TAB+32(%rsp) + movq %r9, TAB+40(%rsp) + movq %r10, TAB+48(%rsp) + movq %r11, TAB+56(%rsp) + +// Compute and record tab[1] = 2 * p, ..., tab[7] = 8 * P + + leaq TAB+96*1(%rsp), %rdi + leaq TAB(%rsp), %rsi + callq p256_scalarmul_alt_local_p256_montjdouble + + leaq TAB+96*2(%rsp), %rdi + leaq TAB+96*1(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq p256_scalarmul_alt_local_p256_montjmixadd + + leaq TAB+96*3(%rsp), %rdi + leaq TAB+96*1(%rsp), %rsi + callq p256_scalarmul_alt_local_p256_montjdouble + + leaq TAB+96*4(%rsp), %rdi + leaq TAB+96*3(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq p256_scalarmul_alt_local_p256_montjmixadd + + leaq TAB+96*5(%rsp), %rdi + leaq TAB+96*2(%rsp), %rsi + callq p256_scalarmul_alt_local_p256_montjdouble + + leaq TAB+96*6(%rsp), %rdi + leaq TAB+96*5(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq p256_scalarmul_alt_local_p256_montjmixadd + + leaq TAB+96*7(%rsp), %rdi + leaq TAB+96*3(%rsp), %rsi + callq p256_scalarmul_alt_local_p256_montjdouble + +// Set up accumulator as table entry for top 4 bits (constant-time indexing) + + movq SCALARB+24(%rsp), %rdi + shrq $60, %rdi + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d + + .set I, 1 +.rep 8 + cmpq $I, %rdi + + cmovzq TAB+96*(I-1)(%rsp), %rax + cmovzq TAB+96*(I-1)+8(%rsp), %rbx + cmovzq TAB+96*(I-1)+16(%rsp), %rcx + cmovzq TAB+96*(I-1)+24(%rsp), %rdx + cmovzq TAB+96*(I-1)+32(%rsp), %r8 + cmovzq TAB+96*(I-1)+40(%rsp), %r9 + cmovzq TAB+96*(I-1)+48(%rsp), %r10 + cmovzq TAB+96*(I-1)+56(%rsp), %r11 + cmovzq TAB+96*(I-1)+64(%rsp), %r12 + cmovzq TAB+96*(I-1)+72(%rsp), %r13 + cmovzq TAB+96*(I-1)+80(%rsp), %r14 + cmovzq TAB+96*(I-1)+88(%rsp), %r15 + .set I, (I+1) +.endr + movq %rax, ACC(%rsp) + movq %rbx, ACC+8(%rsp) + movq %rcx, ACC+16(%rsp) + movq %rdx, ACC+24(%rsp) + movq %r8, ACC+32(%rsp) + movq %r9, ACC+40(%rsp) + movq %r10, ACC+48(%rsp) + movq %r11, ACC+56(%rsp) + movq %r12, ACC+64(%rsp) + movq %r13, ACC+72(%rsp) + movq %r14, ACC+80(%rsp) + movq %r15, ACC+88(%rsp) + +// Main loop over size-4 bitfield + + movl $252, %ebp + +p256_scalarmul_alt_loop: + subq $4, %rbp + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq 
p256_scalarmul_alt_local_p256_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p256_scalarmul_alt_local_p256_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p256_scalarmul_alt_local_p256_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p256_scalarmul_alt_local_p256_montjdouble + + movq %rbp, %rax + shrq $6, %rax + movq (%rsp,%rax,8), %rdi + movq %rbp, %rcx + shrq %cl, %rdi + andq $15, %rdi + + subq $8, %rdi + sbbq %rsi, %rsi // %rsi = sign of digit (-1 = negative) + xorq %rsi, %rdi + subq %rsi, %rdi // %rdi = absolute value of digit + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d + + .set I, 1 +.rep 8 + cmpq $I, %rdi + + cmovzq TAB+96*(I-1)(%rsp), %rax + cmovzq TAB+96*(I-1)+8(%rsp), %rbx + cmovzq TAB+96*(I-1)+16(%rsp), %rcx + cmovzq TAB+96*(I-1)+24(%rsp), %rdx + cmovzq TAB+96*(I-1)+32(%rsp), %r8 + cmovzq TAB+96*(I-1)+40(%rsp), %r9 + cmovzq TAB+96*(I-1)+48(%rsp), %r10 + cmovzq TAB+96*(I-1)+56(%rsp), %r11 + cmovzq TAB+96*(I-1)+64(%rsp), %r12 + cmovzq TAB+96*(I-1)+72(%rsp), %r13 + cmovzq TAB+96*(I-1)+80(%rsp), %r14 + cmovzq TAB+96*(I-1)+88(%rsp), %r15 + .set I, (I+1) +.endr + + movq %r12, TABENT+64(%rsp) + movq %r13, TABENT+72(%rsp) + movq %r14, TABENT+80(%rsp) + movq %r15, TABENT+88(%rsp) + + xorl %r14d, %r14d + leaq -1(%r14), %r12 + movq $0x00000000ffffffff, %r15 + movq %r15, %r13 + negq %r15 + + subq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + sbbq %r11, %r15 + + testq %rsi, %rsi + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) + + movq %r8, TABENT+32(%rsp) + movq %r9, TABENT+40(%rsp) + movq %r10, TABENT+48(%rsp) + movq %r11, TABENT+56(%rsp) + + leaq TABENT(%rsp), %rdx + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p256_scalarmul_alt_local_p256_montjadd + + testq %rbp, %rbp + jne p256_scalarmul_alt_loop + +// Let z2 = 1/z^2 and z3 = 1/z^3, both without Montgomery form + + leaq Z2(%rsp), %rdi + leaq ACC+64(%rsp), %rsi + callq p256_scalarmul_alt_local_montsqr_p256 + + leaq Z3(%rsp), %rdi + leaq ACC+64(%rsp), %rsi + leaq Z2(%rsp), %rdx + callq p256_scalarmul_alt_local_montmul_p256 + + leaq Z2(%rsp), %rdi + leaq Z3(%rsp), %rsi + callq p256_scalarmul_alt_local_demont_p256 + + leaq Z3(%rsp), %rdi + leaq Z2(%rsp), %rsi + callq p256_scalarmul_alt_local_inv_p256 + + leaq Z2(%rsp), %rdi + leaq ACC+64(%rsp), %rsi + leaq Z3(%rsp), %rdx + callq p256_scalarmul_alt_local_montmul_p256 + +// Convert back from Jacobian (X, Y, Z) |-> (X/Z^2, Y/Z^3) + + movq res, %rdi + leaq ACC(%rsp), %rsi + leaq Z2(%rsp), %rdx + movq %rdi, %rbx + callq p256_scalarmul_alt_local_montmul_p256 + + leaq 32(%rbx), %rdi + leaq ACC+32(%rsp), %rsi + leaq Z3(%rsp), %rdx + callq p256_scalarmul_alt_local_montmul_p256 + +// Restore stack and registers and return + + addq $NSPACE, %rsp + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 + ret + +// Local copies of subroutines, complete clones at the moment + +p256_scalarmul_alt_local_demont_p256: + movq (%rsi), %r8 + movq 0x8(%rsi), %r9 + movq 0x10(%rsi), %r10 + movq 0x18(%rsi), %r11 + movabsq $0x100000000, %rcx + movq %r8, %rax + mulq %rcx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rsi, %rsi + movq %r9, %rax + mulq %rcx + subq %rsi, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + 
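Looking back at the main loop above: each iteration extracts one 4-bit window of the recoded scalar at bit position %rbp, subtracts 8, splits the result into a sign mask and an absolute value, and then scans all eight table entries with cmov so the memory access pattern and the selection are independent of the secret digit. A hedged C sketch of that digit handling and table selection (helper names and the struct layout are illustrative, not part of this patch):

#include <stdint.h>
#include <stddef.h>

/* One Montgomery-Jacobian table entry: x, y, z of (i+1)*P, 4 limbs each. */
typedef struct { uint64_t v[12]; } p256_jac;

/* Signed digit at bit position 'pos' (a multiple of 4) of the recoded
   scalar, as in the main loop: window minus 8, split into |d| and sign. */
static void get_digit(const uint64_t scalar[4], unsigned pos,
                      uint64_t *absval, uint64_t *sign) {
  int64_t d = (int64_t)((scalar[pos >> 6] >> (pos & 63)) & 15) - 8;
  uint64_t s = (d < 0) ? ~UINT64_C(0) : 0;   /* matches the sbb-produced mask */
  *sign = s;
  *absval = ((uint64_t)d ^ s) - s;
}

/* Constant-time-style lookup: touch every entry, keep the one whose index
   matches |d| in 1..8; |d| == 0 leaves the result all-zero. */
static void select_entry(p256_jac *out, const p256_jac tab[8], uint64_t absval) {
  for (size_t j = 0; j < 12; j++) out->v[j] = 0;
  for (uint64_t i = 1; i <= 8; i++) {
    uint64_t mask = 0 - (uint64_t)(i == absval);   /* all-ones when selected */
    for (size_t j = 0; j < 12; j++) out->v[j] |= tab[i - 1].v[j] & mask;
  }
}

When the sign mask is non-zero, only the y coordinate of the selected entry has to be negated mod p_256 before the Jacobian addition, which is what the cmovnz block after the table scan in the loop does.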
sbbq %rsi, %rsi + negq %rcx + negq %rsi + incq %rcx + movq %r8, %rax + mulq %rcx + addq %rax, %r11 + adcq %rdx, %rsi + sbbq %r8, %r8 + negq %r8 + movq %r9, %rax + mulq %rcx + addq %rax, %rsi + adcq %rdx, %r8 + negq %rcx + incq %rcx + movq %r10, %rax + mulq %rcx + addq %rax, %r11 + adcq %rdx, %rsi + sbbq %r9, %r9 + movq %r11, %rax + mulq %rcx + subq %r9, %rdx + addq %rax, %rsi + adcq %rdx, %r8 + sbbq %r9, %r9 + negq %rcx + negq %r9 + incq %rcx + movq %r10, %rax + mulq %rcx + addq %rax, %r8 + adcq %rdx, %r9 + sbbq %r10, %r10 + negq %r10 + movq %r11, %rax + mulq %rcx + addq %rax, %r9 + adcq %rdx, %r10 + movq %rsi, (%rdi) + movq %r8, 0x8(%rdi) + movq %r9, 0x10(%rdi) + movq %r10, 0x18(%rdi) + ret + +p256_scalarmul_alt_local_inv_p256: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xf0, %rsp + movq %rdi, 0xe0(%rsp) + xorl %ecx, %ecx + movl $0xffffffff, %edx + movq %rdx, %rbx + leaq -0x1(%rcx), %rax + negq %rdx + movq %rax, (%rsp) + movq %rbx, 0x8(%rsp) + movq %rcx, 0x10(%rsp) + movq %rdx, 0x18(%rsp) + movq %rcx, 0x20(%rsp) + movq (%rsi), %r8 + movq 0x8(%rsi), %r9 + movq 0x10(%rsi), %r10 + movq 0x18(%rsi), %r11 + leaq 0x1(%rcx), %rax + addq %r8, %rax + leaq -0x1(%rdx), %rbx + adcq %r9, %rbx + notq %rcx + adcq %r10, %rcx + notq %rdx + adcq %r11, %rdx + cmovaeq %r8, %rax + cmovaeq %r9, %rbx + cmovaeq %r10, %rcx + cmovaeq %r11, %rdx + movq %rax, 0x28(%rsp) + movq %rbx, 0x30(%rsp) + movq %rcx, 0x38(%rsp) + movq %rdx, 0x40(%rsp) + xorl %eax, %eax + movq %rax, 0x48(%rsp) + xorl %eax, %eax + movq %rax, 0x50(%rsp) + movq %rax, 0x58(%rsp) + movq %rax, 0x60(%rsp) + movq %rax, 0x68(%rsp) + movabsq $0x4000000000000, %rcx + movq %rcx, 0x78(%rsp) + movq %rax, 0x80(%rsp) + movq %rax, 0x88(%rsp) + movq %rax, 0x90(%rsp) + movq $0xa, 0xb0(%rsp) + movq $0x1, 0xb8(%rsp) + jmp p256_scalarmul_alt_inv_midloop +p256_scalarmul_alt_inv_loop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0xa0(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0xa8(%rsp) + xorl %ebx, %ebx + movq (%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + xorl %ecx, %ecx + movq 0x8(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x28(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + 
xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x38(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x30(%rsp) + movq 0x18(%rsp), %rax + xorq %r9, %rax + movq 0x20(%rsp), %rbp + xorq %r9, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x40(%rsp), %rax + xorq %r11, %rax + movq 0x48(%rsp), %rdx + xorq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + sarq $0x3b, %rbp + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + movq 0x20(%rsp), %rsi + movq %rbp, 0x20(%rsp) + xorq %r13, %rax + xorq %r13, %rsi + andq %r12, %rsi + negq %rsi + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rsi + movq 0x40(%rsp), %rax + xorq %r15, %rax + movq 0x48(%rsp), %rdx + xorq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x38(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x40(%rsp) + sarq $0x3b, %rsi + movq %rsi, 0x48(%rsp) + movq 0xa0(%rsp), %rbx + movq 0xa8(%rsp), %rbp + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x78(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x78(%rsp) + xorl %ebx, %ebx + movq 0x58(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x80(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x58(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, 0x58(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x80(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x80(%rsp) + xorl %ecx, %ecx + movq 0x60(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x88(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x60(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x60(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x88(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x88(%rsp) + movq 0x68(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x90(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq 0x68(%rsp), %rax + movq %rcx, 0x68(%rsp) + movq %rdx, 0x70(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx + negq %rcx + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rcx + movq 0x90(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rsi, 0x90(%rsp) + movq %rdx, 0x98(%rsp) + movabsq $0xe000000000000000, %r8 + addq 0x50(%rsp), %r8 + movq $0xffffffffffffffff, %r9 + adcq 0x58(%rsp), %r9 + movq $0x1fffffff, %r10 + adcq 0x60(%rsp), %r10 + movabsq $0x2000000000000000, %r11 + adcq 0x68(%rsp), %r11 + movabsq $0x1fffffffe0000000, %r12 + adcq 0x70(%rsp), %r12 + movq %r8, %rbx + shlq $0x20, %rbx + movabsq $0xffffffff00000001, %rax + mulq %r8 + shrq $0x20, 
%r8 + addq %rbx, %r9 + adcq %r8, %r10 + adcq %rax, %r11 + adcq %rdx, %r12 + sbbq %rax, %rax + movl $0xffffffff, %ebx + andq %rax, %rbx + movabsq $0xffffffff00000001, %rdx + andq %rax, %rdx + subq %rax, %r9 + movq %r9, 0x50(%rsp) + sbbq %rbx, %r10 + movq %r10, 0x58(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x60(%rsp) + sbbq %rdx, %r12 + movq %r12, 0x68(%rsp) + movabsq $0xe000000000000000, %r8 + addq 0x78(%rsp), %r8 + movq $0xffffffffffffffff, %r9 + adcq 0x80(%rsp), %r9 + movq $0x1fffffff, %r10 + adcq 0x88(%rsp), %r10 + movabsq $0x2000000000000000, %r11 + adcq 0x90(%rsp), %r11 + movabsq $0x1fffffffe0000000, %r12 + adcq 0x98(%rsp), %r12 + movq %r8, %rbx + shlq $0x20, %rbx + movabsq $0xffffffff00000001, %rax + mulq %r8 + shrq $0x20, %r8 + addq %rbx, %r9 + adcq %r8, %r10 + adcq %rax, %r11 + adcq %rdx, %r12 + sbbq %rax, %rax + movl $0xffffffff, %ebx + andq %rax, %rbx + movabsq $0xffffffff00000001, %rdx + andq %rax, %rdx + subq %rax, %r9 + movq %r9, 0x78(%rsp) + sbbq %rbx, %r10 + movq %r10, 0x80(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x88(%rsp) + sbbq %rdx, %r12 + movq %r12, 0x90(%rsp) +p256_scalarmul_alt_inv_midloop: + movq 0xb8(%rsp), %rsi + movq (%rsp), %rdx + movq 0x28(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, 
%rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xc0(%rsp) + movq %rbx, 0xc8(%rsp) + movq %rdi, 0xd0(%rsp) + movq %rcx, 0xd8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x28(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq 
%rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + 
movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, %rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + imulq %r11, %r13 + imulq %r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq 0xc0(%rsp), %rax + imulq %r8, %rax + movq 0xd0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xc8(%rsp), %r8 + imulq 0xd8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xc0(%rsp), %rax + imulq %r10, %rax + movq 0xd0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xc8(%rsp), %r10 + imulq 0xd8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs 
%rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq $0x2b, %r8 + sarq $0x2b, %r12 + movabsq $0x20000100000, 
%rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, %rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0xb8(%rsp) + decq 0xb0(%rsp) + jne p256_scalarmul_alt_inv_loop + movq (%rsp), %rax + movq 0x28(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x78(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x58(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x80(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x88(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x68(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x90(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r12, 0x50(%rsp) + movq %r13, 0x58(%rsp) + movq %r14, 0x60(%rsp) + movq %r15, 0x68(%rsp) + movq %r9, 0x70(%rsp) + movabsq $0xe000000000000000, %r8 + addq 0x50(%rsp), %r8 + movq $0xffffffffffffffff, %r9 + adcq 0x58(%rsp), %r9 + movq $0x1fffffff, %r10 + adcq 0x60(%rsp), %r10 + movabsq $0x2000000000000000, %r11 + adcq 0x68(%rsp), %r11 + movabsq $0x1fffffffe0000000, %r12 + adcq 0x70(%rsp), %r12 + movq %r8, %rbx + shlq $0x20, %rbx + movabsq $0xffffffff00000001, %rax + mulq %r8 + shrq $0x20, %r8 + addq %rbx, %r9 + adcq %r8, %r10 + adcq %rax, %r11 + adcq %rdx, %r12 + sbbq %rax, %rax + movl $0xffffffff, %ebx + andq %rax, %rbx + movabsq $0xffffffff00000001, %rdx + andq %rax, %rdx + subq %rax, %r9 + movq %r9, 0x50(%rsp) + sbbq %rbx, %r10 + movq %r10, 0x58(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x60(%rsp) + sbbq %rdx, %r12 + movq %r12, 0x68(%rsp) + movq 0x50(%rsp), %r8 + movq 0x58(%rsp), %r9 + movq 0x60(%rsp), %r10 + movq 0x68(%rsp), %r11 + movl $0x1, %eax + movl $0xffffffff, %ebx + leaq -0x2(%rax), %rcx + leaq -0x1(%rbx), %rdx + notq %rbx + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + cmovaeq %r8, %rax + cmovaeq %r9, %rbx + cmovaeq %r10, %rcx + cmovaeq %r11, %rdx + movq 0xe0(%rsp), %rdi + movq %rax, (%rdi) + movq %rbx, 0x8(%rdi) + movq %rcx, 0x10(%rdi) + movq %rdx, 0x18(%rdi) + addq $0xf0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +p256_scalarmul_alt_local_montmul_p256: + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rdx, %rcx + movq (%rcx), %rbx + movq (%rsi), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x8(%rsi), %rax + mulq %rbx + xorl 
%r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x10(%rsi), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x18(%rsi), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x8(%rcx), %rbx + xorl %r13d, %r13d + movq (%rsi), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x8(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x10(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x18(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rcx), %rbx + xorl %r15d, %r15d + movq (%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x8(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x10(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x18(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rcx), %rbx + xorl %r8d, %r8d + movq (%rsi), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x8(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x10(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x18(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + ret + +p256_scalarmul_alt_local_montsqr_p256: + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq (%rsi), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x8(%rsi), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x18(%rsi), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x10(%rsi), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq (%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x8(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x18(%rsi), %rbx 
+ movq 0x8(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x8(%rsi), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x10(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x18(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + ret + + +p256_scalarmul_alt_local_tomont_p256: + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movl $0x3, %ecx + movq (%rsi), %rax + mulq %rcx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x8(%rsi), %rax + mulq %rcx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x10(%rsi), %rax + mulq %rcx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x18(%rsi), %rax + mulq %rcx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movabsq $0xfffffffbffffffff, %rcx + xorl %r13d, %r13d + movq (%rsi), %rax + mulq %rcx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x8(%rsi), %rax + mulq %rcx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x10(%rsi), %rax + mulq %rcx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x18(%rsi), %rax + mulq %rcx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rcx + movq %r8, %rax + mulq %rcx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rcx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rcx + leaq 0x2(%rcx), %rcx + movq %r8, %rax + mulq %rcx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rcx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq $0xfffffffffffffffe, %rcx + xorl %r15d, %r15d + movq (%rsi), %rax + mulq %rcx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x8(%rsi), %rax + mulq %rcx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + 
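One detail of p256_scalarmul_alt_local_tomont_p256 worth noting: the four per-pass multipliers it uses (0x3, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x4fffffffd) form, read least-significant limb first, what appears to be R^2 = 2^512 mod p_256, so the multiply-and-reduce sequence maps a value a to a*R mod p_256, i.e. into Montgomery form. Written out as a C constant purely for reference (this array is not part of the patch):

#include <stdint.h>

/* Apparent value of R^2 = 2^512 mod p_256, least-significant limb first;
   these are the same four multipliers the tomont routine above applies
   pass by pass before its Montgomery reduction steps. */
static const uint64_t P256_RR[4] = {
    0x0000000000000003ULL, 0xfffffffbffffffffULL,
    0xfffffffffffffffeULL, 0x00000004fffffffdULL};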
sbbq %r8, %r8 + movq 0x10(%rsi), %rax + mulq %rcx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x18(%rsi), %rax + mulq %rcx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movabsq $0x4fffffffd, %rcx + xorl %r8d, %r8d + movq (%rsi), %rax + mulq %rcx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x8(%rsi), %rax + mulq %rcx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x10(%rsi), %rax + mulq %rcx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x18(%rsi), %rax + mulq %rcx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rcx + movq %r10, %rax + mulq %rcx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq %r11, %rax + mulq %rcx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + notq %rcx + leaq 0x2(%rcx), %rcx + movq %r10, %rax + mulq %rcx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq %r11, %rax + mulq %rcx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + sbbq %r9, %r9 + subq %r9, %r8 + xorl %edx, %edx + leaq -0x1(%rdx), %r9 + incq %rdx + addq %r12, %rdx + decq %rcx + adcq %r13, %rcx + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rdx, %r12 + cmovbq %rcx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + ret + +p256_scalarmul_alt_local_p256_montjadd: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xe0, %rsp + movq %rdx, %rbp + movq 0x40(%rsi), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x48(%rsi), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x58(%rsi), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x50(%rsi), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x58(%rsi), %rbx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x50(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x58(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq 
%rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x40(%rbp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x48(%rbp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x58(%rbp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x50(%rbp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x40(%rbp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x48(%rbp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x58(%rbp), %rbx + movq 0x48(%rbp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x48(%rbp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x50(%rbp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x58(%rbp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq 0x20(%rsi), %rbx + movq 0x40(%rbp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x48(%rbp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x50(%rbp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x58(%rbp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x28(%rsi), %rbx + xorl %r13d, %r13d + movq 0x40(%rbp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, 
%r10 + sbbq %r14, %r14 + movq 0x48(%rbp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x50(%rbp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x58(%rbp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x30(%rsi), %rbx + xorl %r15d, %r15d + movq 0x40(%rbp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x48(%rbp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x50(%rbp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x58(%rbp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x38(%rsi), %rbx + xorl %r8d, %r8d + movq 0x40(%rbp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x48(%rbp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x50(%rbp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x58(%rbp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xc0(%rsp) + movq %r13, 0xc8(%rsp) + movq %r14, 0xd0(%rsp) + movq %r15, 0xd8(%rsp) + movq 0x20(%rbp), %rbx + movq 0x40(%rsi), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x48(%rsi), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x50(%rsi), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x58(%rsi), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x28(%rbp), %rbx + xorl %r13d, %r13d + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x48(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x50(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x58(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + 
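+ # The notq/leaq pair that follows rewrites the 0x100000000 (2^32) constant in %rbx
+ # into 0xffffffff00000001, the top 64-bit word of p_256, for the second half of this
+ # reduction step, avoiding a second movabsq.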
notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x30(%rbp), %rbx + xorl %r15d, %r15d + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x48(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x50(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x58(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x38(%rbp), %rbx + xorl %r8d, %r8d + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x48(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x50(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x58(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq 0x0(%rbp), %rbx + movq (%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x10(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x18(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x8(%rbp), %rbx + xorl %r13d, %r13d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rbp), %rbx + xorl %r15d, %r15d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 
0x18(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rbp), %rbx + xorl %r8d, %r8d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq (%rsi), %rbx + movq 0xa0(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xb0(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0xb8(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x8(%rsi), %rbx + xorl %r13d, %r13d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rsi), %rbx + xorl %r15d, %r15d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rsi), %rbx + xorl %r8d, %r8d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq 
$0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x20(%rsp), %rbx + movq (%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x10(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x18(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x28(%rsp), %rbx + xorl %r13d, %r13d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x30(%rsp), %rbx + xorl %r15d, %r15d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x38(%rsp), %rbx + xorl %r8d, %r8d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq 
%r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq 0xc0(%rsp), %rbx + movq 0xa0(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xb0(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0xb8(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0xc8(%rsp), %rbx + xorl %r13d, %r13d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0xd0(%rsp), %rbx + xorl %r15d, %r15d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0xd8(%rsp), %rbx + xorl %r8d, %r8d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xc0(%rsp) + movq %r13, 0xc8(%rsp) + movq %r14, 0xd0(%rsp) + movq %r15, 0xd8(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0xa0(%rsp) + adcq %r10, %rcx + movq %rcx, 
0xa8(%rsp) + adcq $0x0, %r8 + movq %r8, 0xb0(%rsp) + adcq %rdx, %r9 + movq %r9, 0xb8(%rsp) + movq 0x20(%rsp), %rax + subq 0xc0(%rsp), %rax + movq 0x28(%rsp), %rcx + sbbq 0xc8(%rsp), %rcx + movq 0x30(%rsp), %r8 + sbbq 0xd0(%rsp), %r8 + movq 0x38(%rsp), %r9 + sbbq 0xd8(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x20(%rsp) + adcq %r10, %rcx + movq %rcx, 0x28(%rsp) + adcq $0x0, %r8 + movq %r8, 0x30(%rsp) + adcq %rdx, %r9 + movq %r9, 0x38(%rsp) + movq 0xa0(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0xa8(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0xb8(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0xb0(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0xa8(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0xb8(%rsp), %rbx + movq 0xa8(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0xa8(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0xb0(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0xb8(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x20(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x28(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x38(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x30(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 
0x38(%rsp), %rbx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x30(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x38(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x80(%rsp), %rbx + movq 0x60(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x70(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x78(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x88(%rsp), %rbx + xorl %r13d, %r13d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x90(%rsp), %rbx + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x78(%rsp), %rax 
+ mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x98(%rsp), %rbx + xorl %r8d, %r8d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x40(%rsp), %rbx + movq 0x60(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x70(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x78(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x48(%rsp), %rbx + xorl %r13d, %r13d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x50(%rsp), %rbx + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x58(%rsp), %rbx + xorl %r8d, %r8d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq 
$0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq (%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq $0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq $0x0, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + movq 0x40(%rsi), %rbx + movq 0xa0(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xb0(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0xb8(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x48(%rsi), %rbx + xorl %r13d, %r13d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x50(%rsi), %rbx + xorl %r15d, %r15d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x58(%rsi), %rbx + xorl %r8d, %r8d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0xa8(%rsp), 
%rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq (%rsp), %rax + subq 0x40(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x48(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x50(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x58(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq $0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x80(%rsp), %rax + subq (%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0xc0(%rsp), %rbx + movq 0x60(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x70(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x78(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0xc8(%rsp), %rbx + xorl %r13d, %r13d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0xd0(%rsp), %rbx + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x70(%rsp), %rax + mulq %rbx + subq 
%r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0xd8(%rsp), %rbx + xorl %r8d, %r8d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x40(%rbp), %rbx + movq 0xa0(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xb0(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0xb8(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x48(%rbp), %rbx + xorl %r13d, %r13d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x50(%rbp), %rbx + xorl %r15d, %r15d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x58(%rbp), %rbx + xorl %r8d, %r8d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + 
addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq 0x80(%rsp), %rbx + movq 0x20(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x28(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x30(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x38(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x88(%rsp), %rbx + xorl %r13d, %r13d + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x28(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x30(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x38(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x90(%rsp), %rbx + xorl %r15d, %r15d + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x28(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x30(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x38(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x98(%rsp), %rbx + xorl %r8d, %r8d + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x28(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x30(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x38(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq 
%rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x80(%rsp), %rax + subq 0x60(%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x68(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x70(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x78(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0x40(%rsi), %r8 + movq 0x48(%rsi), %r9 + movq 0x50(%rsi), %r10 + movq 0x58(%rsi), %r11 + movq %r8, %rax + movq %r9, %rdx + orq %r10, %rax + orq %r11, %rdx + orq %rdx, %rax + negq %rax + sbbq %rax, %rax + movq 0x40(%rbp), %r12 + movq 0x48(%rbp), %r13 + movq 0x50(%rbp), %r14 + movq 0x58(%rbp), %r15 + movq %r12, %rbx + movq %r13, %rdx + orq %r14, %rbx + orq %r15, %rdx + orq %rdx, %rbx + negq %rbx + sbbq %rbx, %rbx + cmpq %rax, %rbx + cmovbq %r8, %r12 + cmovbq %r9, %r13 + cmovbq %r10, %r14 + cmovbq %r11, %r15 + cmoveq 0xa0(%rsp), %r12 + cmoveq 0xa8(%rsp), %r13 + cmoveq 0xb0(%rsp), %r14 + cmoveq 0xb8(%rsp), %r15 + movq (%rsp), %rax + cmovbq (%rsi), %rax + cmova 0x0(%rbp), %rax + movq 0x8(%rsp), %rbx + cmovbq 0x8(%rsi), %rbx + cmova 0x8(%rbp), %rbx + movq 0x10(%rsp), %rcx + cmovbq 0x10(%rsi), %rcx + cmova 0x10(%rbp), %rcx + movq 0x18(%rsp), %rdx + cmovbq 0x18(%rsi), %rdx + cmova 0x18(%rbp), %rdx + movq 0x80(%rsp), %r8 + cmovbq 0x20(%rsi), %r8 + cmova 0x20(%rbp), %r8 + movq 0x88(%rsp), %r9 + cmovbq 0x28(%rsi), %r9 + cmova 0x28(%rbp), %r9 + movq 0x90(%rsp), %r10 + cmovbq 0x30(%rsi), %r10 + cmova 0x30(%rbp), %r10 + movq 0x98(%rsp), %r11 + cmovbq 0x38(%rsi), %r11 + cmova 0x38(%rbp), %r11 + movq %rax, (%rdi) + movq %rbx, 0x8(%rdi) + movq %rcx, 0x10(%rdi) + movq %rdx, 0x18(%rdi) + movq %r8, 0x20(%rdi) + movq %r9, 0x28(%rdi) + movq %r10, 0x30(%rdi) + movq %r11, 0x38(%rdi) + movq %r12, 0x40(%rdi) + movq %r13, 0x48(%rdi) + movq %r14, 0x50(%rdi) + movq %r15, 0x58(%rdi) + addq $0xe0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +p256_scalarmul_alt_local_p256_montjdouble: + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xc0, %rsp + movq 0x40(%rsi), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x48(%rsi), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x58(%rsi), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x50(%rsi), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x58(%rsi), %rbx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x50(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x58(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + 
adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x20(%rsi), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x28(%rsi), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x38(%rsi), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x30(%rsi), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x20(%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x28(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x38(%rsi), %rbx + movq 0x28(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x28(%rsi), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x30(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x38(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq 
-0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq (%rsi), %rax + subq (%rsp), %rax + movq 0x8(%rsi), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x10(%rsi), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x18(%rsi), %r9 + sbbq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq $0x0, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + movq (%rsi), %rax + addq (%rsp), %rax + movq 0x8(%rsi), %rcx + adcq 0x8(%rsp), %rcx + movq 0x10(%rsi), %r8 + adcq 0x10(%rsp), %r8 + movq 0x18(%rsi), %r9 + adcq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + subq %r11, %rax + movq %rax, 0x40(%rsp) + sbbq %r10, %rcx + movq %rcx, 0x48(%rsp) + sbbq $0x0, %r8 + movq %r8, 0x50(%rsp) + sbbq %rdx, %r9 + movq %r9, 0x58(%rsp) + movq 0x60(%rsp), %rbx + movq 0x40(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x48(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x50(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x58(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x68(%rsp), %rbx + xorl %r13d, %r13d + movq 0x40(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x48(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x50(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x58(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x70(%rsp), %rbx + xorl %r15d, %r15d + movq 0x40(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x48(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x50(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x58(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x78(%rsp), %rbx + xorl %r8d, %r8d + movq 0x40(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x48(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x50(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x58(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + 
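+ # The sbbq %rcx, %rcx below records the pending carry as a 0/-1 mask in %rcx; the
+ # later subq %rcx, %rdx folds it back into the high word of the next partial product.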
sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + xorq %r11, %r11 + movq 0x20(%rsi), %rax + addq 0x40(%rsi), %rax + movq 0x28(%rsi), %rcx + adcq 0x48(%rsi), %rcx + movq 0x30(%rsi), %r8 + adcq 0x50(%rsi), %r8 + movq 0x38(%rsi), %r9 + adcq 0x58(%rsi), %r9 + adcq %r11, %r11 + subq $0xffffffffffffffff, %rax + movl $0xffffffff, %r10d + sbbq %r10, %rcx + sbbq $0x0, %r8 + movabsq $0xffffffff00000001, %rdx + sbbq %rdx, %r9 + sbbq $0x0, %r11 + andq %r11, %r10 + andq %r11, %rdx + addq %r11, %rax + movq %rax, 0x40(%rsp) + adcq %r10, %rcx + movq %rcx, 0x48(%rsp) + adcq $0x0, %r8 + movq %r8, 0x50(%rsp) + adcq %rdx, %r9 + movq %r9, 0x58(%rsp) + movq 0x20(%rsp), %rbx + movq (%rsi), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x8(%rsi), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x10(%rsi), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x18(%rsi), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x28(%rsp), %rbx + xorl %r13d, %r13d + movq (%rsi), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x8(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x10(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x18(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x30(%rsp), %rbx + xorl %r15d, %r15d + movq (%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x8(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x10(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x18(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x38(%rsp), %rbx + xorl %r8d, %r8d + movq (%rsi), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x8(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x10(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x18(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 
+ sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x60(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x68(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x78(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x70(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x68(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x78(%rsp), %rbx + movq 0x68(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x68(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x70(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x78(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq 0x40(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x48(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x58(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x50(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x40(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x48(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq 
%rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x58(%rsp), %rbx + movq 0x48(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x48(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x50(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x58(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq $0xffffffffffffffff, %r9 + xorl %r11d, %r11d + subq 0xa0(%rsp), %r9 + movabsq $0xffffffff, %r10 + sbbq 0xa8(%rsp), %r10 + sbbq 0xb0(%rsp), %r11 + movabsq $0xffffffff00000001, %r12 + sbbq 0xb8(%rsp), %r12 + movq $0x9, %rcx + movq %r9, %rax + mulq %rcx + movq %rax, %r8 + movq %rdx, %r9 + movq %r10, %rax + xorl %r10d, %r10d + mulq %rcx + addq %rax, %r9 + adcq %rdx, %r10 + movq %r11, %rax + xorl %r11d, %r11d + mulq %rcx + addq %rax, %r10 + adcq %rdx, %r11 + movq %r12, %rax + xorl %r12d, %r12d + mulq %rcx + addq %rax, %r11 + adcq %rdx, %r12 + movl $0xc, %ecx + movq 0x80(%rsp), %rax + mulq %rcx + addq %rax, %r8 + adcq %rdx, %r9 + sbbq %rbx, %rbx + movq 0x88(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rbx, %rbx + movq 0x90(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rbx, %rbx + movq 0x98(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + leaq 0x1(%r12), %rcx + movabsq $0xffffffff00000001, %rax + mulq %rcx + movq %rcx, %rbx + shlq $0x20, %rbx + addq %rcx, %r8 + sbbq $0x0, %rbx + subq %rbx, %r9 + sbbq $0x0, %r10 + sbbq %rax, %r11 + sbbq %rdx, %rcx + decq %rcx + movl $0xffffffff, %eax + andq %rcx, %rax + xorl %edx, %edx + subq %rax, %rdx + addq %rcx, %r8 + movq %r8, 0xa0(%rsp) + adcq %rax, %r9 + movq %r9, 0xa8(%rsp) + adcq $0x0, %r10 + movq %r10, 0xb0(%rsp) + adcq %rdx, %r11 + movq %r11, 0xb8(%rsp) + movq 0x40(%rsp), %rax + subq (%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 
0x58(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x40(%rsp) + adcq %r10, %rcx + movq %rcx, 0x48(%rsp) + adcq $0x0, %r8 + movq %r8, 0x50(%rsp) + adcq %rdx, %r9 + movq %r9, 0x58(%rsp) + movq 0x20(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x28(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x38(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x30(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x38(%rsp), %rbx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x30(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x38(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x60(%rsp), %rbx + movq 0xa0(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xb0(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0xb8(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x68(%rsp), %rbx + xorl %r13d, %r13d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq 
%rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x70(%rsp), %rbx + xorl %r15d, %r15d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x78(%rsp), %rbx + xorl %r8d, %r8d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x40(%rsp), %rax + subq 0x20(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x28(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x30(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x38(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x40(%rdi) + adcq %r10, %rcx + movq %rcx, 0x48(%rdi) + adcq $0x0, %r8 + movq %r8, 0x50(%rdi) + adcq %rdx, %r9 + movq %r9, 0x58(%rdi) + movq 0x98(%rsp), %r11 + movq %r11, %rcx + movq 0x90(%rsp), %r10 + shldq $0x2, %r10, %r11 + movq 0x88(%rsp), %r9 + shldq $0x2, %r9, %r10 + movq 0x80(%rsp), %r8 + shldq $0x2, %r8, %r9 + shlq $0x2, %r8 + shrq $0x3e, %rcx + addq $0x1, %rcx + subq 0xa0(%rsp), %r8 + sbbq 0xa8(%rsp), %r9 + sbbq 0xb0(%rsp), %r10 + sbbq 0xb8(%rsp), %r11 + sbbq $0x0, %rcx + movabsq $0xffffffff00000001, %rax + mulq %rcx + movq %rcx, %rbx + shlq $0x20, %rbx + addq %rcx, %r8 + sbbq $0x0, %rbx + subq %rbx, %r9 + sbbq $0x0, %r10 + sbbq %rax, %r11 + sbbq %rdx, %rcx + decq %rcx + movl $0xffffffff, %eax + andq %rcx, %rax + xorl %edx, %edx + subq %rax, %rdx + addq %rcx, %r8 + movq %r8, (%rdi) + adcq %rax, %r9 + movq %r9, 0x8(%rdi) + adcq $0x0, %r10 + movq %r10, 0x10(%rdi) + adcq %rdx, %r11 + movq %r11, 0x18(%rdi) + movq $0xffffffffffffffff, %r8 + xorl %r10d, %r10d + subq (%rsp), %r8 + movabsq $0xffffffff, %r9 + sbbq 0x8(%rsp), %r9 + 
sbbq 0x10(%rsp), %r10 + movabsq $0xffffffff00000001, %r11 + sbbq 0x18(%rsp), %r11 + movq %r11, %r12 + shldq $0x3, %r10, %r11 + shldq $0x3, %r9, %r10 + shldq $0x3, %r8, %r9 + shlq $0x3, %r8 + shrq $0x3d, %r12 + movl $0x3, %ecx + movq 0x60(%rsp), %rax + mulq %rcx + addq %rax, %r8 + adcq %rdx, %r9 + sbbq %rbx, %rbx + movq 0x68(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rbx, %rbx + movq 0x70(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rbx, %rbx + movq 0x78(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + leaq 0x1(%r12), %rcx + movabsq $0xffffffff00000001, %rax + mulq %rcx + movq %rcx, %rbx + shlq $0x20, %rbx + addq %rcx, %r8 + sbbq $0x0, %rbx + subq %rbx, %r9 + sbbq $0x0, %r10 + sbbq %rax, %r11 + sbbq %rdx, %rcx + decq %rcx + movl $0xffffffff, %eax + andq %rcx, %rax + xorl %edx, %edx + subq %rax, %rdx + addq %rcx, %r8 + movq %r8, 0x20(%rdi) + adcq %rax, %r9 + movq %r9, 0x28(%rdi) + adcq $0x0, %r10 + movq %r10, 0x30(%rdi) + adcq %rdx, %r11 + movq %r11, 0x38(%rdi) + addq $0xc0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + ret + +p256_scalarmul_alt_local_p256_montjmixadd: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xc0, %rsp + movq %rdx, %rbp + movq 0x40(%rsi), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x48(%rsi), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x58(%rsi), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x50(%rsi), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x58(%rsi), %rbx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x50(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x58(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq 
%r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x20(%rbp), %rbx + movq 0x40(%rsi), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x48(%rsi), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x50(%rsi), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x58(%rsi), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x28(%rbp), %rbx + xorl %r13d, %r13d + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x48(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x50(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x58(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x30(%rbp), %rbx + xorl %r15d, %r15d + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x48(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x50(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x58(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x38(%rbp), %rbx + xorl %r8d, %r8d + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x48(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x50(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x58(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq 0x0(%rbp), %rbx + movq (%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x10(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x18(%rsp), %rax + mulq %rbx + xorl %r12d, 
%r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x8(%rbp), %rbx + xorl %r13d, %r13d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rbp), %rbx + xorl %r15d, %r15d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rbp), %rbx + xorl %r8d, %r8d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq 0x20(%rsp), %rbx + movq (%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x10(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x18(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x28(%rsp), %rbx + xorl %r13d, %r13d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, 
%r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x30(%rsp), %rbx + xorl %r15d, %r15d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x38(%rsp), %rbx + xorl %r8d, %r8d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq 0x40(%rsp), %rax + subq (%rsi), %rax + movq 0x48(%rsp), %rcx + sbbq 0x8(%rsi), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x10(%rsi), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x18(%rsi), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0xa0(%rsp) + adcq %r10, %rcx + movq %rcx, 0xa8(%rsp) + adcq $0x0, %r8 + movq %r8, 0xb0(%rsp) + adcq %rdx, %r9 + movq %r9, 0xb8(%rsp) + movq 0x20(%rsp), %rax + subq 0x20(%rsi), %rax + movq 0x28(%rsp), %rcx + sbbq 0x28(%rsi), %rcx + movq 0x30(%rsp), %r8 + sbbq 0x30(%rsi), %r8 + movq 0x38(%rsp), %r9 + sbbq 0x38(%rsi), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x20(%rsp) + adcq %r10, %rcx + movq %rcx, 0x28(%rsp) + adcq $0x0, %r8 + movq %r8, 0x30(%rsp) + adcq %rdx, %r9 + movq %r9, 0x38(%rsp) + movq 0xa0(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0xa8(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0xb8(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0xb0(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0xa8(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0xb8(%rsp), %rbx + movq 0xa8(%rsp), %rax + mulq %rbx 
+ subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0xa8(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0xb0(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0xb8(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x20(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x28(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x38(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x30(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x38(%rsp), %rbx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x30(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x38(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq 
%rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq (%rsi), %rbx + movq 0x60(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x70(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x78(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x8(%rsi), %rbx + xorl %r13d, %r13d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rsi), %rbx + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rsi), %rbx + xorl %r8d, %r8d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, 
%r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x40(%rsp), %rbx + movq 0x60(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x70(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x78(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x48(%rsp), %rbx + xorl %r13d, %r13d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x50(%rsp), %rbx + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x58(%rsp), %rbx + xorl %r8d, %r8d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq (%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq 
$0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq $0x0, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + movq 0x40(%rsi), %rbx + movq 0xa0(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xb0(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0xb8(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x48(%rsi), %rbx + xorl %r13d, %r13d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x50(%rsi), %rbx + xorl %r15d, %r15d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x58(%rsi), %rbx + xorl %r8d, %r8d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq (%rsp), %rax + subq 0x40(%rsp), %rax 
+ movq 0x8(%rsp), %rcx + sbbq 0x48(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x50(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x58(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq $0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x80(%rsp), %rax + subq (%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0x20(%rsi), %rbx + movq 0x60(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x70(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x78(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x28(%rsi), %rbx + xorl %r13d, %r13d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x30(%rsi), %rbx + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x38(%rsi), %rbx + xorl %r8d, %r8d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq 
%r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x80(%rsp), %rbx + movq 0x20(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x28(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x30(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x38(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x88(%rsp), %rbx + xorl %r13d, %r13d + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x28(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x30(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x38(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x90(%rsp), %rbx + xorl %r15d, %r15d + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x28(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x30(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x38(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x98(%rsp), %rbx + xorl %r8d, %r8d + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x28(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x30(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x38(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x80(%rsp), %rax + subq 0x60(%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x68(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x70(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x78(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq 
%r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0x40(%rsi), %rax + movq 0x48(%rsi), %rdx + orq 0x50(%rsi), %rax + orq 0x58(%rsi), %rdx + orq %rdx, %rax + movq (%rsp), %r8 + movq 0x0(%rbp), %rax + cmoveq %rax, %r8 + movq 0x8(%rsp), %r9 + movq 0x8(%rbp), %rax + cmoveq %rax, %r9 + movq 0x10(%rsp), %r10 + movq 0x10(%rbp), %rax + cmoveq %rax, %r10 + movq 0x18(%rsp), %r11 + movq 0x18(%rbp), %rax + cmoveq %rax, %r11 + movq 0x80(%rsp), %r12 + movq 0x20(%rbp), %rax + cmoveq %rax, %r12 + movq 0x88(%rsp), %r13 + movq 0x28(%rbp), %rax + cmoveq %rax, %r13 + movq 0x90(%rsp), %r14 + movq 0x30(%rbp), %rax + cmoveq %rax, %r14 + movq 0x98(%rsp), %r15 + movq 0x38(%rbp), %rax + cmoveq %rax, %r15 + movq %r8, (%rdi) + movq %r9, 0x8(%rdi) + movq %r10, 0x10(%rdi) + movq %r11, 0x18(%rdi) + movq %r12, 0x20(%rdi) + movq %r13, 0x28(%rdi) + movq %r14, 0x30(%rdi) + movq %r15, 0x38(%rdi) + movq 0xa0(%rsp), %r8 + movq 0xa8(%rsp), %r9 + movq 0xb0(%rsp), %r10 + movq 0xb8(%rsp), %r11 + movl $0x1, %eax + cmoveq %rax, %r8 + movabsq $0xffffffff00000000, %rax + cmoveq %rax, %r9 + movq $0xffffffffffffffff, %rax + cmoveq %rax, %r10 + movl $0xfffffffe, %eax + cmoveq %rax, %r11 + movq %r8, 0x40(%rdi) + movq %r9, 0x48(%rdi) + movq %r10, 0x50(%rdi) + movq %r11, 0x58(%rdi) + addq $0xc0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_scalarmulbase.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_scalarmulbase.S new file mode 100644 index 00000000000..442dc1331f0 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_scalarmulbase.S @@ -0,0 +1,3532 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Scalar multiplication for precomputed point on NIST curve P-256 +// Input scalar[4], blocksize, table[]; output res[8] +// +// extern void p256_scalarmulbase +// (uint64_t res[static 8], +// uint64_t scalar[static 4], +// uint64_t blocksize, +// uint64_t *table); +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve P-256, the input argument "table" is expected to be a table of +// multiples of the point P in Montgomery-affine form, with each block +// corresponding to "blocksize" bits of the scalar as follows, where +// B = 2^{blocksize-1} (e.g. B = 8 for blocksize = 4): +// +// For each i,j with blocksize * i <= 256 and 1 <= j <= B +// the multiple 2^{blocksize * i} * j * P is stored at +// tab[8 * (B * i + (j - 1))], considered as uint64_t pointers +// or tab + 64 * (B * i + (j - 1)) as byte pointers. +// +// Standard x86-64 ABI: RDI = res, RSI = scalar, RDX = blocksize, RCX = table +// Microsoft x64 ABI: RCX = res, RDX = scalar, R8 = blocksize, R9 = table +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_scalarmulbase) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_scalarmulbase) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Intermediate variables on the stack. 
The last z2, z3 values can +// safely be overlaid on "nacc", which is no longer needed at the end. +// Uppercase syntactic variants make x86_att version simpler to generate + +#define RSCALAR (0*NUMSIZE) +#define ACC (1*NUMSIZE) +#define NACC (4*NUMSIZE) +#define TABENT (7*NUMSIZE) +#define Z2 (4*NUMSIZE) +#define Z3 (5*NUMSIZE) + +#define rscalar RSCALAR(%rsp) +#define acc ACC(%rsp) +#define nacc NACC(%rsp) +#define tabent TABENT(%rsp) + +#define z2 Z2(%rsp) +#define z3 Z3(%rsp) + +#define res (9*NUMSIZE)(%rsp) +#define blocksize (9*NUMSIZE+8)(%rsp) +#define table (9*NUMSIZE+16)(%rsp) +#define i (9*NUMSIZE+24)(%rsp) +#define bf (9*NUMSIZE+32)(%rsp) +#define cf (9*NUMSIZE+40)(%rsp) +#define j (9*NUMSIZE+48)(%rsp) + +#define NSPACE (11*NUMSIZE) + +S2N_BN_SYMBOL(p256_scalarmulbase): + _CET_ENDBR + +// The Windows version literally calls the standard ABI version. +// This simplifies the proofs since subroutine offsets are fixed. + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + callq p256_scalarmulbase_standard + popq %rsi + popq %rdi + ret + +p256_scalarmulbase_standard: +#endif + +// Real start of the standard ABI code. + + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + + subq $NSPACE, %rsp + +// Preserve the input arguments except the scalar, since that gets absorbed +// immediately. The "table" value subsequently gets shifted up each iteration +// of the loop, while "res" and "blocksize" are static throughout. + + movq %rdi, res + movq %rdx, blocksize + movq %rcx, table + +// Load the digits of group order n_256 = [%r15;%r14;%r13;%r12] + + movq $0xf3b9cac2fc632551, %r12 + movq $0xbce6faada7179e84, %r13 + movq $0xffffffffffffffff, %r14 + movq $0xffffffff00000000, %r15 + +// First, reduce the input scalar mod n_256, i.e. conditionally subtract n_256 +// Store it to "rscalar" (reduced scalar) + + movq (%rsi), %r8 + subq %r12, %r8 + movq 8(%rsi), %r9 + sbbq %r13, %r9 + movq 16(%rsi), %r10 + sbbq %r14, %r10 + movq 24(%rsi), %r11 + sbbq %r15, %r11 + + cmovcq (%rsi), %r8 + cmovcq 8(%rsi), %r9 + cmovcq 16(%rsi), %r10 + cmovcq 24(%rsi), %r11 + + movq %r8, RSCALAR(%rsp) + movq %r9, RSCALAR+8(%rsp) + movq %r10, RSCALAR+16(%rsp) + movq %r11, RSCALAR+24(%rsp) + +// Initialize the accumulator to all zeros and the "carry flag" cf to 0 + + xorl %eax, %eax + + movq %rax, ACC(%rsp) + movq %rax, ACC+8(%rsp) + movq %rax, ACC+16(%rsp) + movq %rax, ACC+24(%rsp) + movq %rax, ACC+32(%rsp) + movq %rax, ACC+40(%rsp) + movq %rax, ACC+48(%rsp) + movq %rax, ACC+56(%rsp) + movq %rax, ACC+64(%rsp) + movq %rax, ACC+72(%rsp) + movq %rax, ACC+80(%rsp) + movq %rax, ACC+88(%rsp) + + movq %rax, cf + +// Main loop over {i >= 0 | blocksize * i <= 256}. Note the non-strict +// inequality, to allow top carry for any choices of blocksize. + + movq %rax, i + +p256_scalarmulbase_loop: + +// The next raw bitfield is bf = bitfield(blocksize * i,blocksize) + cf, +// adding in the deferred carry cf. We then shift the whole scalar right +// by blocksize so we can keep picking bitfield(0,blocksize). 
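+// A rough C-style sketch of this step, added for exposition only (the name
+// "mask" and the scalar[4] array view of rscalar are illustrative, not part
+// of the original source):
+//
+//   mask = ((uint64_t)1 << blocksize) - 1;  // movl $1,%eax; shlq %cl,%rax; decq %rax
+//   bf   = (scalar[0] & mask) + cf;         // andq %r8,%rax; addq cf,%rax
+//   scalar >>= blocksize;                   // 256-bit shift via the shrdq/shrq chain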
+ + movq RSCALAR(%rsp), %r8 + movq RSCALAR+8(%rsp), %r9 + movq RSCALAR+16(%rsp), %r10 + movq RSCALAR+24(%rsp), %r11 + + movq blocksize, %rcx + movl $1, %eax + shlq %cl, %rax + decq %rax + andq %r8, %rax + + shrdq %cl, %r9, %r8 + shrdq %cl, %r10, %r9 + shrdq %cl, %r11, %r10 + shrq %cl, %r11 + + addq cf, %rax + movq %rax, bf + + movq %r8, RSCALAR(%rsp) + movq %r9, RSCALAR+8(%rsp) + movq %r10, RSCALAR+16(%rsp) + movq %r11, RSCALAR+24(%rsp) + +// Now if bf <= B we just select entry j, unnegated and set cf = 0. +// If bf > B we set j = 2 * B - bf and negate the j'th entry, setting cf = 1. +// In either case we ultimately add bf, in the latter case with deferred +// carry as 2 * B - (2 * B - bf) = bf. + + movl $1, %eax + movq blocksize, %rcx + shlq %cl, %rax + movq %rax, %rbx + shrq $1, %rax + + subq bf, %rbx + cmpq bf, %rax + + cmovncq bf, %rbx + sbbq %rax, %rax + movq %rbx, j + negq %rax + movq %rax, cf + +// Load table entry j - 1 for nonzero j in constant-time style. + + movq blocksize, %rcx + decq %rcx + movl $1, %esi + shlq %cl, %rsi + movq j, %r12 + movq table, %rbp + +p256_scalarmulbase_tabloop: + subq $1, %r12 + cmovzq (%rbp), %rax + cmovzq 8(%rbp), %rbx + cmovzq 16(%rbp), %rcx + cmovzq 24(%rbp), %rdx + cmovzq 32(%rbp), %r8 + cmovzq 40(%rbp), %r9 + cmovzq 48(%rbp), %r10 + cmovzq 56(%rbp), %r11 + + addq $64, %rbp + decq %rsi + jnz p256_scalarmulbase_tabloop + + movq %rbp, table + +// Before storing back, optionally negate the y coordinate of the table entry + + xorl %r14d, %r14d + leaq -1(%r14), %r12 + movq $0x00000000ffffffff, %r15 + movq %r15, %r13 + negq %r15 + + subq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + sbbq %r11, %r15 + + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) + + movq cf, %rax + testq %rax, %rax + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + + movq %r8, TABENT+32(%rsp) + movq %r9, TABENT+40(%rsp) + movq %r10, TABENT+48(%rsp) + movq %r11, TABENT+56(%rsp) + +// Add the adjusted table point to the accumulator + + leaq NACC(%rsp), %rdi + leaq ACC(%rsp), %rsi + leaq TABENT(%rsp), %rdx + callq p256_scalarmulbase_local_p256_montjmixadd + +// However, only commit that update to the accumulator if j is nonzero, +// because the mixed addition function does not handle this case directly, +// and in any case we didn't choose the table entry appropriately. 
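+// Illustrative C-style sketch of the constant-time select that follows; the
+// acc/nacc arrays are a hypothetical uint64_t[12] view of ACC/NACC (the three
+// field elements X, Y, Z), not names from the original source:
+//
+//   for (k = 0; k < 12; k++)
+//     acc[k] = (j != 0) ? nacc[k] : acc[k];  // testq + cmovnzq, branch-free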
+ + movq j, %rax + testq %rax, %rax + + movq ACC(%rsp), %rax + cmovnzq NACC(%rsp), %rax + movq %rax, ACC(%rsp) + + movq ACC+8(%rsp), %rax + cmovnzq NACC+8(%rsp), %rax + movq %rax, ACC+8(%rsp) + + movq ACC+16(%rsp), %rax + cmovnzq NACC+16(%rsp), %rax + movq %rax, ACC+16(%rsp) + + movq ACC+24(%rsp), %rax + cmovnzq NACC+24(%rsp), %rax + movq %rax, ACC+24(%rsp) + + movq ACC+32(%rsp), %rax + cmovnzq NACC+32(%rsp), %rax + movq %rax, ACC+32(%rsp) + + movq ACC+40(%rsp), %rax + cmovnzq NACC+40(%rsp), %rax + movq %rax, ACC+40(%rsp) + + movq ACC+48(%rsp), %rax + cmovnzq NACC+48(%rsp), %rax + movq %rax, ACC+48(%rsp) + + movq ACC+56(%rsp), %rax + cmovnzq NACC+56(%rsp), %rax + movq %rax, ACC+56(%rsp) + + movq ACC+64(%rsp), %rax + cmovnzq NACC+64(%rsp), %rax + movq %rax, ACC+64(%rsp) + + movq ACC+72(%rsp), %rax + cmovnzq NACC+72(%rsp), %rax + movq %rax, ACC+72(%rsp) + + movq ACC+80(%rsp), %rax + cmovnzq NACC+80(%rsp), %rax + movq %rax, ACC+80(%rsp) + + movq ACC+88(%rsp), %rax + cmovnzq NACC+88(%rsp), %rax + movq %rax, ACC+88(%rsp) + +// Loop while blocksize * i <= 256 + + movq i, %rax + incq %rax + movq %rax, i + + imulq blocksize, %rax + cmpq $257, %rax + jc p256_scalarmulbase_loop + +// That's the end of the main loop, and we just need to translate +// back from the Jacobian representation to affine. First of all, +// let z2 = 1/z^2 and z3 = 1/z^3, both without Montgomery form + + leaq Z2(%rsp), %rdi + leaq ACC+64(%rsp), %rsi + callq p256_scalarmulbase_local_montsqr_p256 + + leaq Z3(%rsp), %rdi + leaq ACC+64(%rsp), %rsi + leaq Z2(%rsp), %rdx + callq p256_scalarmulbase_local_montmul_p256 + + leaq Z2(%rsp), %rdi + leaq Z3(%rsp), %rsi + callq p256_scalarmulbase_local_demont_p256 + + leaq Z3(%rsp), %rdi + leaq Z2(%rsp), %rsi + callq p256_scalarmulbase_local_inv_p256 + + leaq Z2(%rsp), %rdi + leaq ACC+64(%rsp), %rsi + leaq Z3(%rsp), %rdx + callq p256_scalarmulbase_local_montmul_p256 + +// Convert back from Jacobian (X, Y, Z) |-> (X/Z^2, Y/Z^3) + + movq res, %rdi + leaq ACC(%rsp), %rsi + leaq Z2(%rsp), %rdx + movq %rdi, %rbx + callq p256_scalarmulbase_local_montmul_p256 + + leaq 32(%rbx), %rdi + leaq ACC+32(%rsp), %rsi + leaq Z3(%rsp), %rdx + callq p256_scalarmulbase_local_montmul_p256 + +// Restore stack and registers and return + + addq $NSPACE, %rsp + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 + ret + +// Local copies of subroutines, complete clones at the moment + +p256_scalarmulbase_local_demont_p256: + pushq %rbx + movq (%rsi), %r8 + movq 0x8(%rsi), %r9 + movq 0x10(%rsi), %r10 + movq 0x18(%rsi), %r11 + xorq %rbx, %rbx + xorq %rsi, %rsi + movq $0x100000000, %rdx + mulxq %r8, %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq %r9, %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %rbx + mulxq %r9, %rax, %rcx + adcxq %rax, %rbx + adoxq %rcx, %rsi + movl $0x0, %r8d + adcxq %r8, %rsi + xorq %r9, %r9 + movq $0x100000000, %rdx + mulxq %r10, %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %rbx + mulxq %r11, %rax, %rcx + adcxq %rax, %rbx + adoxq %rcx, %rsi + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rcx + adcxq %rax, %rsi + adoxq %rcx, %r8 + mulxq %r11, %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + movl $0x0, %r10d + adcxq %r10, %r9 + movq %rbx, (%rdi) + movq %rsi, 0x8(%rdi) + movq %r8, 0x10(%rdi) + movq %r9, 0x18(%rdi) + popq %rbx + ret + +p256_scalarmulbase_local_inv_p256: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xf0, %rsp + movq %rdi, 
0xe0(%rsp) + xorl %ecx, %ecx + movl $0xffffffff, %edx + movq %rdx, %rbx + leaq -0x1(%rcx), %rax + negq %rdx + movq %rax, (%rsp) + movq %rbx, 0x8(%rsp) + movq %rcx, 0x10(%rsp) + movq %rdx, 0x18(%rsp) + movq %rcx, 0x20(%rsp) + movq (%rsi), %r8 + movq 0x8(%rsi), %r9 + movq 0x10(%rsi), %r10 + movq 0x18(%rsi), %r11 + leaq 0x1(%rcx), %rax + addq %r8, %rax + leaq -0x1(%rdx), %rbx + adcq %r9, %rbx + notq %rcx + adcq %r10, %rcx + notq %rdx + adcq %r11, %rdx + cmovaeq %r8, %rax + cmovaeq %r9, %rbx + cmovaeq %r10, %rcx + cmovaeq %r11, %rdx + movq %rax, 0x28(%rsp) + movq %rbx, 0x30(%rsp) + movq %rcx, 0x38(%rsp) + movq %rdx, 0x40(%rsp) + xorl %eax, %eax + movq %rax, 0x48(%rsp) + xorl %eax, %eax + movq %rax, 0x50(%rsp) + movq %rax, 0x58(%rsp) + movq %rax, 0x60(%rsp) + movq %rax, 0x68(%rsp) + movq $0x4000000000000, %rcx + movq %rcx, 0x78(%rsp) + movq %rax, 0x80(%rsp) + movq %rax, 0x88(%rsp) + movq %rax, 0x90(%rsp) + movq $0xa, 0xb0(%rsp) + movq $0x1, 0xb8(%rsp) + jmp p256_scalarmulbase_inv_midloop +p256_scalarmulbase_inv_loop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0xa0(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0xa8(%rsp) + xorl %ebx, %ebx + movq (%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + xorl %ecx, %ecx + movq 0x8(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x28(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x38(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x30(%rsp) + movq 0x18(%rsp), %rax + xorq %r9, %rax + movq 0x20(%rsp), %rbp + xorq %r9, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x40(%rsp), %rax + xorq %r11, %rax + movq 0x48(%rsp), %rdx + xorq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + sarq $0x3b, %rbp + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + movq 0x20(%rsp), %rsi + movq %rbp, 0x20(%rsp) + xorq %r13, %rax + xorq %r13, %rsi + andq %r12, %rsi + negq %rsi + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rsi + movq 
0x40(%rsp), %rax + xorq %r15, %rax + movq 0x48(%rsp), %rdx + xorq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x38(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x40(%rsp) + sarq $0x3b, %rsi + movq %rsi, 0x48(%rsp) + movq 0xa0(%rsp), %rbx + movq 0xa8(%rsp), %rbp + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x78(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x78(%rsp) + xorl %ebx, %ebx + movq 0x58(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x80(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x58(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, 0x58(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x80(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x80(%rsp) + xorl %ecx, %ecx + movq 0x60(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x88(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x60(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x60(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x88(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x88(%rsp) + movq 0x68(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x90(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq 0x68(%rsp), %rax + movq %rcx, 0x68(%rsp) + movq %rdx, 0x70(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx + negq %rcx + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rcx + movq 0x90(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rsi, 0x90(%rsp) + movq %rdx, 0x98(%rsp) + movq $0xe000000000000000, %r8 + addq 0x50(%rsp), %r8 + movq $0xffffffffffffffff, %r9 + adcq 0x58(%rsp), %r9 + movq $0x1fffffff, %r10 + adcq 0x60(%rsp), %r10 + movq $0x2000000000000000, %r11 + adcq 0x68(%rsp), %r11 + movq $0x1fffffffe0000000, %r12 + adcq 0x70(%rsp), %r12 + movq %r8, %rbx + shlq $0x20, %rbx + movq $0xffffffff00000001, %rax + mulq %r8 + shrq $0x20, %r8 + addq %rbx, %r9 + adcq %r8, %r10 + adcq %rax, %r11 + adcq %rdx, %r12 + sbbq %rax, %rax + movl $0xffffffff, %ebx + andq %rax, %rbx + movq $0xffffffff00000001, %rdx + andq %rax, %rdx + subq %rax, %r9 + movq %r9, 0x50(%rsp) + sbbq %rbx, %r10 + movq %r10, 0x58(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x60(%rsp) + sbbq %rdx, %r12 + movq %r12, 0x68(%rsp) + movq $0xe000000000000000, %r8 + addq 0x78(%rsp), %r8 + movq $0xffffffffffffffff, %r9 + adcq 0x80(%rsp), %r9 + movq $0x1fffffff, %r10 + adcq 0x88(%rsp), %r10 + movq $0x2000000000000000, %r11 + adcq 0x90(%rsp), %r11 + movq $0x1fffffffe0000000, %r12 + adcq 0x98(%rsp), %r12 + movq %r8, %rbx + shlq $0x20, %rbx + movq $0xffffffff00000001, %rax + mulq %r8 + shrq $0x20, %r8 + addq %rbx, %r9 + adcq %r8, %r10 + adcq %rax, %r11 + adcq %rdx, %r12 + sbbq %rax, %rax + movl $0xffffffff, %ebx + andq %rax, %rbx 
+ movq $0xffffffff00000001, %rdx + andq %rax, %rdx + subq %rax, %r9 + movq %r9, 0x78(%rsp) + sbbq %rbx, %r10 + movq %r10, 0x80(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x88(%rsp) + sbbq %rdx, %r12 + movq %r12, 0x90(%rsp) +p256_scalarmulbase_inv_midloop: + movq 0xb8(%rsp), %rsi + movq (%rsp), %rdx + movq 0x28(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi 
+ sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xc0(%rsp) + movq %rbx, 0xc8(%rsp) + movq %rdi, 0xd0(%rsp) + movq %rcx, 0xd8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x28(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, 
%rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + 
xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, %rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + imulq %r11, %r13 + imulq %r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movq $0xc000000000000000, %rax + orq %rax, %rcx + movq 0xc0(%rsp), %rax + imulq %r8, %rax + movq 0xd0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xc8(%rsp), %r8 + imulq 0xd8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xc0(%rsp), %rax + imulq %r10, %rax + movq 0xd0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xc8(%rsp), %r10 + imulq 0xd8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq 
%rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq $0x2b, %r8 + sarq $0x2b, %r12 + movq $0x20000100000, %rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, %rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0xb8(%rsp) + decq 0xb0(%rsp) + jne p256_scalarmulbase_inv_loop + movq (%rsp), %rax + movq 0x28(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, 
%r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x78(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x58(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x80(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x88(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x68(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x90(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r12, 0x50(%rsp) + movq %r13, 0x58(%rsp) + movq %r14, 0x60(%rsp) + movq %r15, 0x68(%rsp) + movq %r9, 0x70(%rsp) + movq $0xe000000000000000, %r8 + addq 0x50(%rsp), %r8 + movq $0xffffffffffffffff, %r9 + adcq 0x58(%rsp), %r9 + movq $0x1fffffff, %r10 + adcq 0x60(%rsp), %r10 + movq $0x2000000000000000, %r11 + adcq 0x68(%rsp), %r11 + movq $0x1fffffffe0000000, %r12 + adcq 0x70(%rsp), %r12 + movq %r8, %rbx + shlq $0x20, %rbx + movq $0xffffffff00000001, %rax + mulq %r8 + shrq $0x20, %r8 + addq %rbx, %r9 + adcq %r8, %r10 + adcq %rax, %r11 + adcq %rdx, %r12 + sbbq %rax, %rax + movl $0xffffffff, %ebx + andq %rax, %rbx + movq $0xffffffff00000001, %rdx + andq %rax, %rdx + subq %rax, %r9 + movq %r9, 0x50(%rsp) + sbbq %rbx, %r10 + movq %r10, 0x58(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x60(%rsp) + sbbq %rdx, %r12 + movq %r12, 0x68(%rsp) + movq 0x50(%rsp), %r8 + movq 0x58(%rsp), %r9 + movq 0x60(%rsp), %r10 + movq 0x68(%rsp), %r11 + movl $0x1, %eax + movl $0xffffffff, %ebx + leaq -0x2(%rax), %rcx + leaq -0x1(%rbx), %rdx + notq %rbx + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + cmovaeq %r8, %rax + cmovaeq %r9, %rbx + cmovaeq %r10, %rcx + cmovaeq %r11, %rdx + movq 0xe0(%rsp), %rdi + movq %rax, (%rdi) + movq %rbx, 0x8(%rdi) + movq %rcx, 0x10(%rdi) + movq %rdx, 0x18(%rdi) + addq $0xf0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +p256_scalarmulbase_local_montmul_p256: + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rdx, %rcx + xorl %r13d, %r13d + movq (%rcx), %rdx + mulxq (%rsi), %r8, %r9 + mulxq 0x8(%rsi), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x10(%rsi), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x18(%rsi), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x8(%rcx), %rdx + xorl %r14d, %r14d + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x10(%rcx), %rdx + xorl %r8d, %r8d + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + 
mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x18(%rsi), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x18(%rcx), %rdx + xorl %r9d, %r9d + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x18(%rsi), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + ret + +p256_scalarmulbase_local_montsqr_p256: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq (%rsi), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x8(%rsi), %r9, %r10 + mulxq 0x18(%rsi), %r11, %r12 + movq 0x10(%rsi), %rdx + mulxq 0x18(%rsi), %r13, %r14 + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x18(%rsi), %rdx + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + xorl %ebp, %ebp + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x8(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x10(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x18(%rsi), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rbp, %r15 + adoxq %rbp, %r15 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + movl %ebp, %r9d + adoxq %rbp, %r9 + adcxq %rbp, %r9 + addq %r9, %r14 + adcq %rbp, %r15 + movl %ebp, %r8d + adcq %rbp, %r8 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rbp, %r15 + adoxq %rbp, %r8 + adcq %rbp, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rdx), %rdx + adcq %r13, %rdx + leaq -0x1(%rbp), %rbp + movq %rbp, %rax + adcq %r14, %rbp + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbp, %r14 + cmovbq %r11, %r15 + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + 
movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +p256_scalarmulbase_local_p256_montjmixadd: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xc0, %rsp + movq %rdx, %rbp + movq 0x40(%rsi), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x48(%rsi), %r9, %r10 + mulxq 0x58(%rsi), %r11, %r12 + movq 0x50(%rsi), %rdx + mulxq 0x58(%rsi), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x58(%rsi), %rdx + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x48(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x50(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x58(%rsi), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + movl %ecx, %r9d + adoxq %rcx, %r9 + adcxq %rcx, %r9 + addq %r9, %r14 + adcq %rcx, %r15 + movl %ecx, %r8d + adcq %rcx, %r8 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rcx, %r15 + adoxq %rcx, %r8 + adcq %rcx, %r8 + movl $0x1, %r8d + leaq -0x1(%rdx), %rdx + leaq -0x1(%rcx), %rax + movl $0xfffffffe, %r11d + cmoveq %rcx, %r8 + cmoveq %rcx, %rdx + cmoveq %rcx, %rax + cmoveq %rcx, %r11 + addq %r8, %r12 + adcq %rdx, %r13 + adcq %rax, %r14 + adcq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + xorl %r13d, %r13d + movq 0x20(%rbp), %rdx + mulxq 0x40(%rsi), %r8, %r9 + mulxq 0x48(%rsi), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x50(%rsi), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x58(%rsi), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x28(%rbp), %rdx + xorl %r14d, %r14d + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x58(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x30(%rbp), %rdx + xorl %r8d, %r8d + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r12 + 
adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x58(%rsi), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x38(%rbp), %rdx + xorl %r9d, %r9d + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x58(%rsi), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + xorl %r13d, %r13d + movq 0x0(%rbp), %rdx + mulxq (%rsp), %r8, %r9 + mulxq 0x8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x10(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x18(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x8(%rbp), %rdx + xorl %r14d, %r14d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x10(%rbp), %rdx + xorl %r8d, %r8d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x18(%rbp), %rdx + xorl %r9d, %r9d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 
0x58(%rsp) + xorl %r13d, %r13d + movq 0x20(%rsp), %rdx + mulxq (%rsp), %r8, %r9 + mulxq 0x8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x10(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x18(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x28(%rsp), %rdx + xorl %r14d, %r14d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x30(%rsp), %rdx + xorl %r8d, %r8d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x38(%rsp), %rdx + xorl %r9d, %r9d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq 0x40(%rsp), %rax + subq (%rsi), %rax + movq 0x48(%rsp), %rcx + sbbq 0x8(%rsi), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x10(%rsi), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x18(%rsi), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0xa0(%rsp) + adcq %r10, %rcx + movq %rcx, 0xa8(%rsp) + adcq $0x0, %r8 + movq %r8, 0xb0(%rsp) + adcq %rdx, %r9 + movq %r9, 0xb8(%rsp) + movq 0x20(%rsp), %rax + subq 0x20(%rsi), %rax + movq 0x28(%rsp), %rcx + sbbq 0x28(%rsi), %rcx + movq 0x30(%rsp), %r8 + sbbq 0x30(%rsi), %r8 + movq 0x38(%rsp), %r9 + sbbq 0x38(%rsi), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x20(%rsp) + adcq %r10, %rcx + movq %rcx, 0x28(%rsp) + adcq $0x0, %r8 + movq %r8, 0x30(%rsp) + adcq %rdx, %r9 + movq %r9, 0x38(%rsp) + movq 0xa0(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0xa8(%rsp), %r9, %r10 + mulxq 0xb8(%rsp), %r11, %r12 + movq 0xb0(%rsp), %rdx + mulxq 0xb8(%rsp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, 
%rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0xb8(%rsp), %rdx + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0xa8(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0xb0(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0xb8(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + movl %ecx, %r9d + adoxq %rcx, %r9 + adcxq %rcx, %r9 + addq %r9, %r14 + adcq %rcx, %r15 + movl %ecx, %r8d + adcq %rcx, %r8 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rcx, %r15 + adoxq %rcx, %r8 + adcq %rcx, %r8 + movl $0x1, %r8d + leaq -0x1(%rdx), %rdx + leaq -0x1(%rcx), %rax + movl $0xfffffffe, %r11d + cmoveq %rcx, %r8 + cmoveq %rcx, %rdx + cmoveq %rcx, %rax + cmoveq %rcx, %r11 + addq %r8, %r12 + adcq %rdx, %r13 + adcq %rax, %r14 + adcq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x20(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x28(%rsp), %r9, %r10 + mulxq 0x38(%rsp), %r11, %r12 + movq 0x30(%rsp), %rdx + mulxq 0x38(%rsp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x38(%rsp), %rdx + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x28(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x30(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x38(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + movl %ecx, %r9d + adoxq %rcx, %r9 + adcxq %rcx, %r9 + addq %r9, %r14 + adcq %rcx, %r15 + movl %ecx, %r8d + adcq %rcx, %r8 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rcx, %r15 + adoxq %rcx, %r8 + adcq %rcx, %r8 
+ movl $0x1, %ebx + addq %r12, %rbx + leaq -0x1(%rdx), %rdx + adcq %r13, %rdx + leaq -0x1(%rcx), %rcx + movq %rcx, %rax + adcq %r14, %rcx + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rbx, %r12 + cmovbq %rdx, %r13 + cmovbq %rcx, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + xorl %r13d, %r13d + movq (%rsi), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x70(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x78(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x8(%rsi), %rdx + xorl %r14d, %r14d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x10(%rsi), %rdx + xorl %r8d, %r8d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x18(%rsi), %rdx + xorl %r9d, %r9d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + xorl %r13d, %r13d + movq 0x40(%rsp), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x70(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x78(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x48(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx 
+ adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x50(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x58(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq (%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq $0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq $0x0, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + xorl %r13d, %r13d + movq 0x40(%rsi), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0xb0(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0xb8(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x48(%rsi), %rdx + xorl %r14d, %r14d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x50(%rsi), %rdx + xorl %r8d, %r8d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 
0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x58(%rsi), %rdx + xorl %r9d, %r9d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq (%rsp), %rax + subq 0x40(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x48(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x50(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x58(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq $0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x80(%rsp), %rax + subq (%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + xorl %r13d, %r13d + movq 0x20(%rsi), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x70(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x78(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x28(%rsi), %rdx + xorl %r14d, %r14d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x30(%rsi), %rdx + xorl %r8d, %r8d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x38(%rsi), %rdx + xorl %r9d, %r9d + mulxq 
0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + xorl %r13d, %r13d + movq 0x80(%rsp), %rdx + mulxq 0x20(%rsp), %r8, %r9 + mulxq 0x28(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x30(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x38(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x88(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x38(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x90(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x38(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x98(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x38(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x80(%rsp), %rax + subq 0x60(%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x68(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x70(%rsp), 
%r8 + movq 0x98(%rsp), %r9 + sbbq 0x78(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0x40(%rsi), %rax + movq 0x48(%rsi), %rdx + orq 0x50(%rsi), %rax + orq 0x58(%rsi), %rdx + orq %rdx, %rax + movq (%rsp), %r8 + movq 0x0(%rbp), %rax + cmoveq %rax, %r8 + movq 0x8(%rsp), %r9 + movq 0x8(%rbp), %rax + cmoveq %rax, %r9 + movq 0x10(%rsp), %r10 + movq 0x10(%rbp), %rax + cmoveq %rax, %r10 + movq 0x18(%rsp), %r11 + movq 0x18(%rbp), %rax + cmoveq %rax, %r11 + movq 0x80(%rsp), %r12 + movq 0x20(%rbp), %rax + cmoveq %rax, %r12 + movq 0x88(%rsp), %r13 + movq 0x28(%rbp), %rax + cmoveq %rax, %r13 + movq 0x90(%rsp), %r14 + movq 0x30(%rbp), %rax + cmoveq %rax, %r14 + movq 0x98(%rsp), %r15 + movq 0x38(%rbp), %rax + cmoveq %rax, %r15 + movq %r8, (%rdi) + movq %r9, 0x8(%rdi) + movq %r10, 0x10(%rdi) + movq %r11, 0x18(%rdi) + movq %r12, 0x20(%rdi) + movq %r13, 0x28(%rdi) + movq %r14, 0x30(%rdi) + movq %r15, 0x38(%rdi) + movq 0xa0(%rsp), %r8 + movq 0xa8(%rsp), %r9 + movq 0xb0(%rsp), %r10 + movq 0xb8(%rsp), %r11 + movl $0x1, %eax + cmoveq %rax, %r8 + movq $0xffffffff00000000, %rax + cmoveq %rax, %r9 + movq $0xffffffffffffffff, %rax + cmoveq %rax, %r10 + movl $0xfffffffe, %eax + cmoveq %rax, %r11 + movq %r8, 0x40(%rdi) + movq %r9, 0x48(%rdi) + movq %r10, 0x50(%rdi) + movq %r11, 0x58(%rdi) + addq $0xc0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_scalarmulbase_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_scalarmulbase_alt.S new file mode 100644 index 00000000000..14191e4d55e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_scalarmulbase_alt.S @@ -0,0 +1,4173 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Scalar multiplication for precomputed point on NIST curve P-256 +// Input scalar[4], blocksize, table[]; output res[8] +// +// extern void p256_scalarmulbase_alt +// (uint64_t res[static 8], +// uint64_t scalar[static 4], +// uint64_t blocksize, +// uint64_t *table); +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve P-256, the input argument "table" is expected to be a table of +// multiples of the point P in Montgomery-affine form, with each block +// corresponding to "blocksize" bits of the scalar as follows, where +// B = 2^{blocksize-1} (e.g. B = 8 for blocksize = 4): +// +// For each i,j with blocksize * i <= 256 and 1 <= j <= B +// the multiple 2^{blocksize * i} * j * P is stored at +// tab[8 * (B * i + (j - 1))], considered as uint64_t pointers +// or tab + 64 * (B * i + (j - 1)) as byte pointers. 
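As a quick cross-check of the table-addressing rule just described, a minimal C sketch of the same indexing follows; it is illustrative only and not part of the imported source, and the helper name p256_precomp_entry is hypothetical.

#include <stdint.h>

/* Pointer to the 64-byte table entry holding 2^(blocksize*i) * j * P in
   Montgomery-affine form (x in the first 4 limbs, y in the next 4),
   assuming 1 <= blocksize <= 63, blocksize*i <= 256 and 1 <= j <= B. */
static inline const uint64_t *p256_precomp_entry(const uint64_t *tab,
                                                 uint64_t blocksize,
                                                 uint64_t i, uint64_t j) {
  uint64_t B = (uint64_t)1 << (blocksize - 1);   /* B = 2^{blocksize-1} */
  return tab + 8 * (B * i + (j - 1));            /* byte offset 64*(B*i + j - 1) */
}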
+// +// Standard x86-64 ABI: RDI = res, RSI = scalar, RDX = blocksize, RCX = table +// Microsoft x64 ABI: RCX = res, RDX = scalar, R8 = blocksize, R9 = table +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_scalarmulbase_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_scalarmulbase_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Intermediate variables on the stack. The last z2, z3 values can +// safely be overlaid on "nacc", which is no longer needed at the end. +// Uppercase syntactic variants make x86_att version simpler to generate + +#define RSCALAR (0*NUMSIZE) +#define ACC (1*NUMSIZE) +#define NACC (4*NUMSIZE) +#define TABENT (7*NUMSIZE) +#define Z2 (4*NUMSIZE) +#define Z3 (5*NUMSIZE) + +#define rscalar RSCALAR(%rsp) +#define acc ACC(%rsp) +#define nacc NACC(%rsp) +#define tabent TABENT(%rsp) + +#define z2 Z2(%rsp) +#define z3 Z3(%rsp) + +#define res (9*NUMSIZE)(%rsp) +#define blocksize (9*NUMSIZE+8)(%rsp) +#define table (9*NUMSIZE+16)(%rsp) +#define i (9*NUMSIZE+24)(%rsp) +#define bf (9*NUMSIZE+32)(%rsp) +#define cf (9*NUMSIZE+40)(%rsp) +#define j (9*NUMSIZE+48)(%rsp) + +#define NSPACE (11*NUMSIZE) + +S2N_BN_SYMBOL(p256_scalarmulbase_alt): + _CET_ENDBR + +// The Windows version literally calls the standard ABI version. +// This simplifies the proofs since subroutine offsets are fixed. + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + callq p256_scalarmulbase_alt_standard + popq %rsi + popq %rdi + ret + +p256_scalarmulbase_alt_standard: +#endif + +// Real start of the standard ABI code. + + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + + subq $NSPACE, %rsp + +// Preserve the input arguments except the scalar, since that gets absorbed +// immediately. The "table" value subsequently gets shifted up each iteration +// of the loop, while "res" and "blocksize" are static throughout. + + movq %rdi, res + movq %rdx, blocksize + movq %rcx, table + +// Load the digits of group order n_256 = [%r15;%r14;%r13;%r12] + + movq $0xf3b9cac2fc632551, %r12 + movq $0xbce6faada7179e84, %r13 + movq $0xffffffffffffffff, %r14 + movq $0xffffffff00000000, %r15 + +// First, reduce the input scalar mod n_256, i.e. conditionally subtract n_256 +// Store it to "rscalar" (reduced scalar) + + movq (%rsi), %r8 + subq %r12, %r8 + movq 8(%rsi), %r9 + sbbq %r13, %r9 + movq 16(%rsi), %r10 + sbbq %r14, %r10 + movq 24(%rsi), %r11 + sbbq %r15, %r11 + + cmovcq (%rsi), %r8 + cmovcq 8(%rsi), %r9 + cmovcq 16(%rsi), %r10 + cmovcq 24(%rsi), %r11 + + movq %r8, RSCALAR(%rsp) + movq %r9, RSCALAR+8(%rsp) + movq %r10, RSCALAR+16(%rsp) + movq %r11, RSCALAR+24(%rsp) + +// Initialize the accumulator to all zeros and the "carry flag" cf to 0 + + xorl %eax, %eax + + movq %rax, ACC(%rsp) + movq %rax, ACC+8(%rsp) + movq %rax, ACC+16(%rsp) + movq %rax, ACC+24(%rsp) + movq %rax, ACC+32(%rsp) + movq %rax, ACC+40(%rsp) + movq %rax, ACC+48(%rsp) + movq %rax, ACC+56(%rsp) + movq %rax, ACC+64(%rsp) + movq %rax, ACC+72(%rsp) + movq %rax, ACC+80(%rsp) + movq %rax, ACC+88(%rsp) + + movq %rax, cf + +// Main loop over {i >= 0 | blocksize * i <= 256}. Note the non-strict +// inequality, to allow top carry for any choices of blocksize. + + movq %rax, i + +p256_scalarmulbase_alt_loop: + +// The next raw bitfield is bf = bitfield(blocksize * i,blocksize) + cf, +// adding in the deferred carry cf. 
We then shift the whole scalar right +// by blocksize so we can keep picking bitfield(0,blocksize). + + movq RSCALAR(%rsp), %r8 + movq RSCALAR+8(%rsp), %r9 + movq RSCALAR+16(%rsp), %r10 + movq RSCALAR+24(%rsp), %r11 + + movq blocksize, %rcx + movl $1, %eax + shlq %cl, %rax + decq %rax + andq %r8, %rax + + shrdq %cl, %r9, %r8 + shrdq %cl, %r10, %r9 + shrdq %cl, %r11, %r10 + shrq %cl, %r11 + + addq cf, %rax + movq %rax, bf + + movq %r8, RSCALAR(%rsp) + movq %r9, RSCALAR+8(%rsp) + movq %r10, RSCALAR+16(%rsp) + movq %r11, RSCALAR+24(%rsp) + +// Now if bf <= B we just select entry j, unnegated and set cf = 0. +// If bf > B we set j = 2 * B - bf and negate the j'th entry, setting cf = 1. +// In either case we ultimately add bf, in the latter case with deferred +// carry as 2 * B - (2 * B - bf) = bf. + + movl $1, %eax + movq blocksize, %rcx + shlq %cl, %rax + movq %rax, %rbx + shrq $1, %rax + + subq bf, %rbx + cmpq bf, %rax + + cmovncq bf, %rbx + sbbq %rax, %rax + movq %rbx, j + negq %rax + movq %rax, cf + +// Load table entry j - 1 for nonzero j in constant-time style. + + movq blocksize, %rcx + decq %rcx + movl $1, %esi + shlq %cl, %rsi + movq j, %r12 + movq table, %rbp + +p256_scalarmulbase_alt_tabloop: + subq $1, %r12 + cmovzq (%rbp), %rax + cmovzq 8(%rbp), %rbx + cmovzq 16(%rbp), %rcx + cmovzq 24(%rbp), %rdx + cmovzq 32(%rbp), %r8 + cmovzq 40(%rbp), %r9 + cmovzq 48(%rbp), %r10 + cmovzq 56(%rbp), %r11 + + addq $64, %rbp + decq %rsi + jnz p256_scalarmulbase_alt_tabloop + + movq %rbp, table + +// Before storing back, optionally negate the y coordinate of the table entry + + xorl %r14d, %r14d + leaq -1(%r14), %r12 + movq $0x00000000ffffffff, %r15 + movq %r15, %r13 + negq %r15 + + subq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + sbbq %r11, %r15 + + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) + + movq cf, %rax + testq %rax, %rax + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + + movq %r8, TABENT+32(%rsp) + movq %r9, TABENT+40(%rsp) + movq %r10, TABENT+48(%rsp) + movq %r11, TABENT+56(%rsp) + +// Add the adjusted table point to the accumulator + + leaq NACC(%rsp), %rdi + leaq ACC(%rsp), %rsi + leaq TABENT(%rsp), %rdx + callq p256_scalarmulbase_alt_local_p256_montjmixadd + +// However, only commit that update to the accumulator if j is nonzero, +// because the mixed addition function does not handle this case directly, +// and in any case we didn't choose the table entry appropriately. 
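To make the digit handling described in the comments above concrete, here is a short C sketch of just the recoding arithmetic: the per-iteration bitfield plus deferred carry, the bf > B case, and the j == 0 skip. It is a sketch only; the function names next_bitfield and recode_walk are hypothetical, the branches use plain if/else for readability rather than the constant-time cmov/negation pattern the assembly actually uses, and no field or point arithmetic is shown.

#include <stdint.h>

/* Extract the low 'blocksize' bits of a 4-limb scalar and shift the scalar
   right by 'blocksize' in place (assumes 1 <= blocksize <= 63). */
static uint64_t next_bitfield(uint64_t s[4], uint64_t blocksize) {
  uint64_t bits = s[0] & (((uint64_t)1 << blocksize) - 1);
  for (int k = 0; k < 4; k++) {
    uint64_t hi = (k < 3) ? s[k + 1] : 0;
    s[k] = (s[k] >> blocksize) | (hi << (64 - blocksize));
  }
  return bits;
}

/* Walk the (already reduced) scalar the same way the main loop does. */
static void recode_walk(uint64_t scalar[4], uint64_t blocksize) {
  uint64_t B = (uint64_t)1 << (blocksize - 1);
  uint64_t cf = 0;                     /* deferred carry into the next digit */
  for (uint64_t i = 0; blocksize * i <= 256; i++) {
    uint64_t bf = next_bitfield(scalar, blocksize) + cf;
    uint64_t j, negate_y;
    if (bf <= B) {                     /* take +bf*P as-is, no deferred carry */
      j = bf;
      negate_y = 0;
      cf = 0;
    } else {                           /* take -(2*B - bf)*P and defer a carry of
                                          2*B, since -(2*B - bf) + 2*B = bf */
      j = 2 * B - bf;
      negate_y = 1;
      cf = 1;
    }
    /* The assembly now loads entry j-1 in constant time, optionally negates
       the y coordinate (replacing it by p_256 - y) when negate_y is set, and
       commits the mixed addition to the accumulator only when j != 0. */
    (void)j; (void)negate_y;
  }
}

With blocksize = 4 this is the familiar width-4 signed-window walk with B = 8, matching the example given in the header comment.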
+ + movq j, %rax + testq %rax, %rax + + movq ACC(%rsp), %rax + cmovnzq NACC(%rsp), %rax + movq %rax, ACC(%rsp) + + movq ACC+8(%rsp), %rax + cmovnzq NACC+8(%rsp), %rax + movq %rax, ACC+8(%rsp) + + movq ACC+16(%rsp), %rax + cmovnzq NACC+16(%rsp), %rax + movq %rax, ACC+16(%rsp) + + movq ACC+24(%rsp), %rax + cmovnzq NACC+24(%rsp), %rax + movq %rax, ACC+24(%rsp) + + movq ACC+32(%rsp), %rax + cmovnzq NACC+32(%rsp), %rax + movq %rax, ACC+32(%rsp) + + movq ACC+40(%rsp), %rax + cmovnzq NACC+40(%rsp), %rax + movq %rax, ACC+40(%rsp) + + movq ACC+48(%rsp), %rax + cmovnzq NACC+48(%rsp), %rax + movq %rax, ACC+48(%rsp) + + movq ACC+56(%rsp), %rax + cmovnzq NACC+56(%rsp), %rax + movq %rax, ACC+56(%rsp) + + movq ACC+64(%rsp), %rax + cmovnzq NACC+64(%rsp), %rax + movq %rax, ACC+64(%rsp) + + movq ACC+72(%rsp), %rax + cmovnzq NACC+72(%rsp), %rax + movq %rax, ACC+72(%rsp) + + movq ACC+80(%rsp), %rax + cmovnzq NACC+80(%rsp), %rax + movq %rax, ACC+80(%rsp) + + movq ACC+88(%rsp), %rax + cmovnzq NACC+88(%rsp), %rax + movq %rax, ACC+88(%rsp) + +// Loop while blocksize * i <= 256 + + movq i, %rax + incq %rax + movq %rax, i + + imulq blocksize, %rax + cmpq $257, %rax + jc p256_scalarmulbase_alt_loop + +// That's the end of the main loop, and we just need to translate +// back from the Jacobian representation to affine. First of all, +// let z2 = 1/z^2 and z3 = 1/z^3, both without Montgomery form + + leaq Z2(%rsp), %rdi + leaq ACC+64(%rsp), %rsi + callq p256_scalarmulbase_alt_local_montsqr_p256 + + leaq Z3(%rsp), %rdi + leaq ACC+64(%rsp), %rsi + leaq Z2(%rsp), %rdx + callq p256_scalarmulbase_alt_local_montmul_p256 + + leaq Z2(%rsp), %rdi + leaq Z3(%rsp), %rsi + callq p256_scalarmulbase_alt_local_demont_p256 + + leaq Z3(%rsp), %rdi + leaq Z2(%rsp), %rsi + callq p256_scalarmulbase_alt_local_inv_p256 + + leaq Z2(%rsp), %rdi + leaq ACC+64(%rsp), %rsi + leaq Z3(%rsp), %rdx + callq p256_scalarmulbase_alt_local_montmul_p256 + +// Convert back from Jacobian (X, Y, Z) |-> (X/Z^2, Y/Z^3) + + movq res, %rdi + leaq ACC(%rsp), %rsi + leaq Z2(%rsp), %rdx + movq %rdi, %rbx + callq p256_scalarmulbase_alt_local_montmul_p256 + + leaq 32(%rbx), %rdi + leaq ACC+32(%rsp), %rsi + leaq Z3(%rsp), %rdx + callq p256_scalarmulbase_alt_local_montmul_p256 + +// Restore stack and registers and return + + addq $NSPACE, %rsp + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 + ret + +// Local copies of subroutines, complete clones at the moment + +p256_scalarmulbase_alt_local_demont_p256: + movq (%rsi), %r8 + movq 0x8(%rsi), %r9 + movq 0x10(%rsi), %r10 + movq 0x18(%rsi), %r11 + movabsq $0x100000000, %rcx + movq %r8, %rax + mulq %rcx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rsi, %rsi + movq %r9, %rax + mulq %rcx + subq %rsi, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rsi, %rsi + negq %rcx + negq %rsi + incq %rcx + movq %r8, %rax + mulq %rcx + addq %rax, %r11 + adcq %rdx, %rsi + sbbq %r8, %r8 + negq %r8 + movq %r9, %rax + mulq %rcx + addq %rax, %rsi + adcq %rdx, %r8 + negq %rcx + incq %rcx + movq %r10, %rax + mulq %rcx + addq %rax, %r11 + adcq %rdx, %rsi + sbbq %r9, %r9 + movq %r11, %rax + mulq %rcx + subq %r9, %rdx + addq %rax, %rsi + adcq %rdx, %r8 + sbbq %r9, %r9 + negq %rcx + negq %r9 + incq %rcx + movq %r10, %rax + mulq %rcx + addq %rax, %r8 + adcq %rdx, %r9 + sbbq %r10, %r10 + negq %r10 + movq %r11, %rax + mulq %rcx + addq %rax, %r9 + adcq %rdx, %r10 + movq %rsi, (%rdi) + movq %r8, 0x8(%rdi) + movq %r9, 0x10(%rdi) + movq %r10, 0x18(%rdi) + ret + +p256_scalarmulbase_alt_local_inv_p256: + pushq %rbx + pushq %rbp + 
pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xf0, %rsp + movq %rdi, 0xe0(%rsp) + xorl %ecx, %ecx + movl $0xffffffff, %edx + movq %rdx, %rbx + leaq -0x1(%rcx), %rax + negq %rdx + movq %rax, (%rsp) + movq %rbx, 0x8(%rsp) + movq %rcx, 0x10(%rsp) + movq %rdx, 0x18(%rsp) + movq %rcx, 0x20(%rsp) + movq (%rsi), %r8 + movq 0x8(%rsi), %r9 + movq 0x10(%rsi), %r10 + movq 0x18(%rsi), %r11 + leaq 0x1(%rcx), %rax + addq %r8, %rax + leaq -0x1(%rdx), %rbx + adcq %r9, %rbx + notq %rcx + adcq %r10, %rcx + notq %rdx + adcq %r11, %rdx + cmovaeq %r8, %rax + cmovaeq %r9, %rbx + cmovaeq %r10, %rcx + cmovaeq %r11, %rdx + movq %rax, 0x28(%rsp) + movq %rbx, 0x30(%rsp) + movq %rcx, 0x38(%rsp) + movq %rdx, 0x40(%rsp) + xorl %eax, %eax + movq %rax, 0x48(%rsp) + xorl %eax, %eax + movq %rax, 0x50(%rsp) + movq %rax, 0x58(%rsp) + movq %rax, 0x60(%rsp) + movq %rax, 0x68(%rsp) + movabsq $0x4000000000000, %rcx + movq %rcx, 0x78(%rsp) + movq %rax, 0x80(%rsp) + movq %rax, 0x88(%rsp) + movq %rax, 0x90(%rsp) + movq $0xa, 0xb0(%rsp) + movq $0x1, 0xb8(%rsp) + jmp p256_scalarmulbase_alt_inv_midloop +p256_scalarmulbase_alt_inv_loop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0xa0(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0xa8(%rsp) + xorl %ebx, %ebx + movq (%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + xorl %ecx, %ecx + movq 0x8(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x28(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x38(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x30(%rsp) + movq 0x18(%rsp), %rax + xorq %r9, %rax + movq 0x20(%rsp), %rbp + xorq %r9, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x40(%rsp), %rax + xorq %r11, %rax + movq 0x48(%rsp), %rdx + xorq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + sarq $0x3b, %rbp + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + movq 0x20(%rsp), %rsi + movq %rbp, 0x20(%rsp) + xorq %r13, %rax + xorq %r13, %rsi + 
andq %r12, %rsi + negq %rsi + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rsi + movq 0x40(%rsp), %rax + xorq %r15, %rax + movq 0x48(%rsp), %rdx + xorq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x38(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x40(%rsp) + sarq $0x3b, %rsi + movq %rsi, 0x48(%rsp) + movq 0xa0(%rsp), %rbx + movq 0xa8(%rsp), %rbp + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x78(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x78(%rsp) + xorl %ebx, %ebx + movq 0x58(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x80(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x58(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, 0x58(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x80(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x80(%rsp) + xorl %ecx, %ecx + movq 0x60(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x88(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x60(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x60(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x88(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x88(%rsp) + movq 0x68(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x90(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq 0x68(%rsp), %rax + movq %rcx, 0x68(%rsp) + movq %rdx, 0x70(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx + negq %rcx + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rcx + movq 0x90(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rsi, 0x90(%rsp) + movq %rdx, 0x98(%rsp) + movabsq $0xe000000000000000, %r8 + addq 0x50(%rsp), %r8 + movq $0xffffffffffffffff, %r9 + adcq 0x58(%rsp), %r9 + movq $0x1fffffff, %r10 + adcq 0x60(%rsp), %r10 + movabsq $0x2000000000000000, %r11 + adcq 0x68(%rsp), %r11 + movabsq $0x1fffffffe0000000, %r12 + adcq 0x70(%rsp), %r12 + movq %r8, %rbx + shlq $0x20, %rbx + movabsq $0xffffffff00000001, %rax + mulq %r8 + shrq $0x20, %r8 + addq %rbx, %r9 + adcq %r8, %r10 + adcq %rax, %r11 + adcq %rdx, %r12 + sbbq %rax, %rax + movl $0xffffffff, %ebx + andq %rax, %rbx + movabsq $0xffffffff00000001, %rdx + andq %rax, %rdx + subq %rax, %r9 + movq %r9, 0x50(%rsp) + sbbq %rbx, %r10 + movq %r10, 0x58(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x60(%rsp) + sbbq %rdx, %r12 + movq %r12, 0x68(%rsp) + movabsq $0xe000000000000000, %r8 + addq 0x78(%rsp), %r8 + movq $0xffffffffffffffff, %r9 + adcq 0x80(%rsp), %r9 + movq $0x1fffffff, %r10 + adcq 0x88(%rsp), %r10 + movabsq $0x2000000000000000, %r11 + adcq 0x90(%rsp), %r11 + movabsq $0x1fffffffe0000000, %r12 + adcq 0x98(%rsp), %r12 + movq %r8, %rbx + shlq $0x20, %rbx + movabsq $0xffffffff00000001, %rax + mulq %r8 + shrq $0x20, %r8 + addq %rbx, %r9 + 
adcq %r8, %r10 + adcq %rax, %r11 + adcq %rdx, %r12 + sbbq %rax, %rax + movl $0xffffffff, %ebx + andq %rax, %rbx + movabsq $0xffffffff00000001, %rdx + andq %rax, %rdx + subq %rax, %r9 + movq %r9, 0x78(%rsp) + sbbq %rbx, %r10 + movq %r10, 0x80(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x88(%rsp) + sbbq %rdx, %r12 + movq %r12, 0x90(%rsp) +p256_scalarmulbase_alt_inv_midloop: + movq 0xb8(%rsp), %rsi + movq (%rsp), %rdx + movq 0x28(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq 
%rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xc0(%rsp) + movq %rbx, 0xc8(%rsp) + movq %rdi, 0xd0(%rsp) + movq %rcx, 0xd8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x28(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi 
+ testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi 
+ leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, %rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + imulq %r11, %r13 + imulq %r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq 0xc0(%rsp), %rax + imulq %r8, %rax + movq 0xd0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xc8(%rsp), %r8 + imulq 0xd8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xc0(%rsp), %rax + imulq %r10, %rax + movq 0xd0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xc8(%rsp), %r10 + imulq 0xd8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + 
testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq $0x2b, %r8 + sarq $0x2b, %r12 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, %rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0xb8(%rsp) + decq 0xb0(%rsp) + jne p256_scalarmulbase_alt_inv_loop + movq (%rsp), %rax + movq 0x28(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + 
xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x78(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x58(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x80(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x88(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x68(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x90(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r12, 0x50(%rsp) + movq %r13, 0x58(%rsp) + movq %r14, 0x60(%rsp) + movq %r15, 0x68(%rsp) + movq %r9, 0x70(%rsp) + movabsq $0xe000000000000000, %r8 + addq 0x50(%rsp), %r8 + movq $0xffffffffffffffff, %r9 + adcq 0x58(%rsp), %r9 + movq $0x1fffffff, %r10 + adcq 0x60(%rsp), %r10 + movabsq $0x2000000000000000, %r11 + adcq 0x68(%rsp), %r11 + movabsq $0x1fffffffe0000000, %r12 + adcq 0x70(%rsp), %r12 + movq %r8, %rbx + shlq $0x20, %rbx + movabsq $0xffffffff00000001, %rax + mulq %r8 + shrq $0x20, %r8 + addq %rbx, %r9 + adcq %r8, %r10 + adcq %rax, %r11 + adcq %rdx, %r12 + sbbq %rax, %rax + movl $0xffffffff, %ebx + andq %rax, %rbx + movabsq $0xffffffff00000001, %rdx + andq %rax, %rdx + subq %rax, %r9 + movq %r9, 0x50(%rsp) + sbbq %rbx, %r10 + movq %r10, 0x58(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x60(%rsp) + sbbq %rdx, %r12 + movq %r12, 0x68(%rsp) + movq 0x50(%rsp), %r8 + movq 0x58(%rsp), %r9 + movq 0x60(%rsp), %r10 + movq 0x68(%rsp), %r11 + movl $0x1, %eax + movl $0xffffffff, %ebx + leaq -0x2(%rax), %rcx + leaq -0x1(%rbx), %rdx + notq %rbx + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + cmovaeq %r8, %rax + cmovaeq %r9, %rbx + cmovaeq %r10, %rcx + cmovaeq %r11, %rdx + movq 0xe0(%rsp), %rdi + movq %rax, (%rdi) + movq %rbx, 0x8(%rdi) + movq %rcx, 0x10(%rdi) + movq %rdx, 0x18(%rdi) + addq $0xf0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +p256_scalarmulbase_alt_local_montmul_p256: + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rdx, %rcx + movq (%rcx), %rbx + movq (%rsi), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x8(%rsi), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x10(%rsi), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x18(%rsi), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x8(%rcx), %rbx + xorl %r13d, %r13d + movq (%rsi), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x8(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x10(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x18(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + 
addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rcx), %rbx + xorl %r15d, %r15d + movq (%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x8(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x10(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x18(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rcx), %rbx + xorl %r8d, %r8d + movq (%rsi), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x8(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x10(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x18(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + ret + +p256_scalarmulbase_alt_local_montsqr_p256: + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq (%rsi), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x8(%rsi), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x18(%rsi), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x10(%rsi), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq (%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x8(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x18(%rsi), %rbx + movq 0x8(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x8(%rsi), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x10(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x18(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + 
adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + ret + +p256_scalarmulbase_alt_local_p256_montjmixadd: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xc0, %rsp + movq %rdx, %rbp + movq 0x40(%rsi), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x48(%rsi), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x58(%rsi), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x50(%rsi), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x58(%rsi), %rbx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x50(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x58(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, 
%rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x20(%rbp), %rbx + movq 0x40(%rsi), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x48(%rsi), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x50(%rsi), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x58(%rsi), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x28(%rbp), %rbx + xorl %r13d, %r13d + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x48(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x50(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x58(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x30(%rbp), %rbx + xorl %r15d, %r15d + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x48(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x50(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x58(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x38(%rbp), %rbx + xorl %r8d, %r8d + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x48(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x50(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x58(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq 0x0(%rbp), %rbx + movq (%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x10(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + 
movq 0x18(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x8(%rbp), %rbx + xorl %r13d, %r13d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rbp), %rbx + xorl %r15d, %r15d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rbp), %rbx + xorl %r8d, %r8d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq 0x20(%rsp), %rbx + movq (%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x10(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x18(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x28(%rsp), %rbx + xorl %r13d, %r13d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, 
%rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x30(%rsp), %rbx + xorl %r15d, %r15d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x38(%rsp), %rbx + xorl %r8d, %r8d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq 0x40(%rsp), %rax + subq (%rsi), %rax + movq 0x48(%rsp), %rcx + sbbq 0x8(%rsi), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x10(%rsi), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x18(%rsi), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0xa0(%rsp) + adcq %r10, %rcx + movq %rcx, 0xa8(%rsp) + adcq $0x0, %r8 + movq %r8, 0xb0(%rsp) + adcq %rdx, %r9 + movq %r9, 0xb8(%rsp) + movq 0x20(%rsp), %rax + subq 0x20(%rsi), %rax + movq 0x28(%rsp), %rcx + sbbq 0x28(%rsi), %rcx + movq 0x30(%rsp), %r8 + sbbq 0x30(%rsi), %r8 + movq 0x38(%rsp), %r9 + sbbq 0x38(%rsi), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x20(%rsp) + adcq %r10, %rcx + movq %rcx, 0x28(%rsp) + adcq $0x0, %r8 + movq %r8, 0x30(%rsp) + adcq %rdx, %r9 + movq %r9, 0x38(%rsp) + movq 0xa0(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0xa8(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0xb8(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0xb0(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0xa8(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 
0xb8(%rsp), %rbx + movq 0xa8(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0xa8(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0xb0(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0xb8(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x20(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x28(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x38(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x30(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x38(%rsp), %rbx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x30(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x38(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq 
$0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq (%rsi), %rbx + movq 0x60(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x70(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x78(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x8(%rsi), %rbx + xorl %r13d, %r13d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rsi), %rbx + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rsi), %rbx + xorl %r8d, %r8d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax 
+ adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x40(%rsp), %rbx + movq 0x60(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x70(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x78(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x48(%rsp), %rbx + xorl %r13d, %r13d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x50(%rsp), %rbx + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x58(%rsp), %rbx + xorl %r8d, %r8d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq (%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 
(%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq $0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq $0x0, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + movq 0x40(%rsi), %rbx + movq 0xa0(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xb0(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0xb8(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x48(%rsi), %rbx + xorl %r13d, %r13d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x50(%rsi), %rbx + xorl %r15d, %r15d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x58(%rsi), %rbx + xorl %r8d, %r8d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 
0xb8(%rsp) + movq (%rsp), %rax + subq 0x40(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x48(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x50(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x58(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq $0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x80(%rsp), %rax + subq (%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0x20(%rsi), %rbx + movq 0x60(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x70(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x78(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x28(%rsi), %rbx + xorl %r13d, %r13d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x30(%rsi), %rbx + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x38(%rsi), %rbx + xorl %r8d, %r8d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx 
+ addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x80(%rsp), %rbx + movq 0x20(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x28(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x30(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x38(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x88(%rsp), %rbx + xorl %r13d, %r13d + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x28(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x30(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x38(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x90(%rsp), %rbx + xorl %r15d, %r15d + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x28(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x30(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x38(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x98(%rsp), %rbx + xorl %r8d, %r8d + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x28(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x30(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x38(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x80(%rsp), %rax + subq 0x60(%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x68(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x70(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x78(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + 
xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0x40(%rsi), %rax + movq 0x48(%rsi), %rdx + orq 0x50(%rsi), %rax + orq 0x58(%rsi), %rdx + orq %rdx, %rax + movq (%rsp), %r8 + movq 0x0(%rbp), %rax + cmoveq %rax, %r8 + movq 0x8(%rsp), %r9 + movq 0x8(%rbp), %rax + cmoveq %rax, %r9 + movq 0x10(%rsp), %r10 + movq 0x10(%rbp), %rax + cmoveq %rax, %r10 + movq 0x18(%rsp), %r11 + movq 0x18(%rbp), %rax + cmoveq %rax, %r11 + movq 0x80(%rsp), %r12 + movq 0x20(%rbp), %rax + cmoveq %rax, %r12 + movq 0x88(%rsp), %r13 + movq 0x28(%rbp), %rax + cmoveq %rax, %r13 + movq 0x90(%rsp), %r14 + movq 0x30(%rbp), %rax + cmoveq %rax, %r14 + movq 0x98(%rsp), %r15 + movq 0x38(%rbp), %rax + cmoveq %rax, %r15 + movq %r8, (%rdi) + movq %r9, 0x8(%rdi) + movq %r10, 0x10(%rdi) + movq %r11, 0x18(%rdi) + movq %r12, 0x20(%rdi) + movq %r13, 0x28(%rdi) + movq %r14, 0x30(%rdi) + movq %r15, 0x38(%rdi) + movq 0xa0(%rsp), %r8 + movq 0xa8(%rsp), %r9 + movq 0xb0(%rsp), %r10 + movq 0xb8(%rsp), %r11 + movl $0x1, %eax + cmoveq %rax, %r8 + movabsq $0xffffffff00000000, %rax + cmoveq %rax, %r9 + movq $0xffffffffffffffff, %rax + cmoveq %rax, %r10 + movl $0xfffffffe, %eax + cmoveq %rax, %r11 + movq %r8, 0x40(%rdi) + movq %r9, 0x48(%rdi) + movq %r10, 0x50(%rdi) + movq %r11, 0x58(%rdi) + addq $0xc0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_add_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_add_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_add_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_add_p384.S index 94293e4e703..4d7387f1809 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_add_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_add_p384.S @@ -41,6 +41,7 @@ S2N_BN_SYMBOL(bignum_add_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_bigendian_6.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_bigendian_6.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_bigendian_6.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_bigendian_6.S index 0a23e35659f..59d417b9cdf 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_bigendian_6.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_bigendian_6.S @@ -50,6 +50,7 @@ S2N_BN_SYMBOL(bignum_bigendian_6): S2N_BN_SYMBOL(bignum_frombebytes_6): S2N_BN_SYMBOL(bignum_tobebytes_6): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_cmul_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_cmul_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_cmul_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_cmul_p384.S index 76f67950872..1d9ff617379 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_cmul_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_cmul_p384.S @@ -45,6 +45,7 @@ S2N_BN_SYMBOL(bignum_cmul_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_cmul_p384_alt.S 
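The newly added P-256 routine that ends above is a mulq-based mix of 4x4 limb multiplications and Montgomery reductions modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1, and each block finishes with the same branch-free correction: add 2^256 - p_256 to the four result limbs and let cmovb keep the wrapped sum only when that addition (together with the spilled top carry word) overflows 256 bits, i.e. only when the intermediate value was at least p_256. A minimal C sketch of that correction follows; p256_reduce_once, its signature, and the use of unsigned __int128 are illustrative assumptions, not an AWS-LC or s2n-bignum interface.

#include <stdint.h>

/* Sketch of the select-on-carry correction used after each multiply,
 * square, and reduction block above.  Keep t + (2^256 - p_256) only if
 * the addition carries out of 256 bits or the incoming top word is set;
 * otherwise keep t.  The choice is made by masking, never by a branch,
 * so the step stays constant time. */
static void p256_reduce_once(uint64_t r[4], const uint64_t t[4], uint64_t top) {
  /* 2^256 - p_256 in little-endian 64-bit limbs: the same constants
   * 0x1, 0xffffffff00000000, 0xffffffffffffffff, 0xfffffffe that the
   * assembly materializes just before its cmovb sequence. */
  static const uint64_t NEG_P256[4] = {
      0x0000000000000001ULL, 0xffffffff00000000ULL,
      0xffffffffffffffffULL, 0x00000000fffffffeULL};
  uint64_t s[4], carry = 0;
  for (int i = 0; i < 4; i++) {
    unsigned __int128 acc = (unsigned __int128)t[i] + NEG_P256[i] + carry;
    s[i] = (uint64_t)acc;
    carry = (uint64_t)(acc >> 64);
  }
  /* Reduce exactly when the addition carried or the extra word was set. */
  uint64_t mask = 0 - ((carry | top) & 1);
  for (int i = 0; i < 4; i++) {
    r[i] = (s[i] & mask) | (t[i] & ~mask);
  }
}

The same four constants also appear in the routine's final cmoveq writeback, since 2^256 - p_256 equals 2^256 mod p_256, the Montgomery representation of 1.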
b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_cmul_p384_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_cmul_p384_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_cmul_p384_alt.S index 2e21e646150..f5b78addcec 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_cmul_p384_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_cmul_p384_alt.S @@ -49,6 +49,7 @@ #define qshort %ecx S2N_BN_SYMBOL(bignum_cmul_p384_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_deamont_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_deamont_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_deamont_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_deamont_p384.S index 9edb4ab6108..5b02af4252a 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_deamont_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_deamont_p384.S @@ -71,6 +71,7 @@ sbbq $0, d6 S2N_BN_SYMBOL(bignum_deamont_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_deamont_p384_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_deamont_p384_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_deamont_p384_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_deamont_p384_alt.S index c0e6096bdd2..2c97a83a884 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_deamont_p384_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_deamont_p384_alt.S @@ -71,6 +71,7 @@ sbbq $0, d6 S2N_BN_SYMBOL(bignum_deamont_p384_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_demont_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_demont_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_demont_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_demont_p384.S index 36a5ef0078f..0ad5d43248a 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_demont_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_demont_p384.S @@ -63,6 +63,7 @@ sbbq $0, d6 S2N_BN_SYMBOL(bignum_demont_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_demont_p384_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_demont_p384_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_demont_p384_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_demont_p384_alt.S index adccd962e70..dafc219c17d 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_demont_p384_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_demont_p384_alt.S @@ -63,6 +63,7 @@ sbbq $0, d6 S2N_BN_SYMBOL(bignum_demont_p384_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_double_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_double_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_double_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_double_p384.S index 7e0c35dab37..1afc75482d8 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_double_p384.S +++ 
b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_double_p384.S @@ -39,6 +39,7 @@ S2N_BN_SYMBOL(bignum_double_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_half_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_half_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_half_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_half_p384.S index a3e39541739..629dcedd5c9 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_half_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_half_p384.S @@ -36,6 +36,7 @@ S2N_BN_SYMBOL(bignum_half_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_inv_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_inv_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_inv_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_inv_p384.S index e1dfecfa2ea..2acba946089 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_inv_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_inv_p384.S @@ -1041,6 +1041,7 @@ leaq (%rax,%rdx), %r12 S2N_BN_SYMBOL(bignum_inv_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi @@ -1140,9 +1141,9 @@ S2N_BN_SYMBOL(bignum_inv_p384): movq $15, i movq $1, d - jmp midloop + jmp bignum_inv_p384_midloop -loop: +bignum_inv_p384_loop: // Separate out the matrix into sign-magnitude pairs @@ -1587,7 +1588,7 @@ loop: amontred(v) -midloop: +bignum_inv_p384_midloop: divstep59(d,ff,gg) movq %rsi, d @@ -1595,7 +1596,7 @@ midloop: // Next iteration decq i - jnz loop + jnz bignum_inv_p384_loop // The 15th and last iteration does not need anything except the // u value and the sign of f; the latter can be obtained from the diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_littleendian_6.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_littleendian_6.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_littleendian_6.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_littleendian_6.S index fe5744a86ef..d4110ef56e0 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_littleendian_6.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_littleendian_6.S @@ -42,6 +42,7 @@ S2N_BN_SYMBOL(bignum_littleendian_6): S2N_BN_SYMBOL(bignum_fromlebytes_6): S2N_BN_SYMBOL(bignum_tolebytes_6): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_mod_n384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_n384.S similarity index 91% rename from third_party/s2n-bignum/x86_att/p384/bignum_mod_n384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_n384.S index 169a136ea32..4914f5a1769 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_mod_n384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_n384.S @@ -43,6 +43,7 @@ S2N_BN_SYMBOL(bignum_mod_n384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi @@ -62,7 +63,7 @@ S2N_BN_SYMBOL(bignum_mod_n384): // If the input is already <= 5 words long, go to a trivial "copy" path cmpq $6, k - jc shortinput + jc bignum_mod_n384_shortinput // Otherwise load the top 6 digits (top-down) and reduce k by 6 @@ -105,9 +106,9 @@ S2N_BN_SYMBOL(bignum_mod_n384): // Now do (k-6) iterations of 7->6 word modular reduction testq k, k - 
jz writeback + jz bignum_mod_n384_writeback -loop: +bignum_mod_n384_loop: // Compute q = min (m5 + 1) (2^64 - 1) @@ -170,11 +171,11 @@ loop: movq d, m0 decq k - jnz loop + jnz bignum_mod_n384_loop // Write back -writeback: +bignum_mod_n384_writeback: movq m0, (z) movq m1, 8(z) @@ -195,7 +196,7 @@ writeback: #endif ret -shortinput: +bignum_mod_n384_shortinput: xorq m0, m0 xorq m1, m1 @@ -205,21 +206,21 @@ shortinput: xorq m5, m5 testq k, k - jz writeback + jz bignum_mod_n384_writeback movq (%rdx), m0 decq k - jz writeback + jz bignum_mod_n384_writeback movq 8(%rdx), m1 decq k - jz writeback + jz bignum_mod_n384_writeback movq 16(%rdx), m2 decq k - jz writeback + jz bignum_mod_n384_writeback movq 24(%rdx), m3 decq k - jz writeback + jz bignum_mod_n384_writeback movq 32(%rdx), m4 - jmp writeback + jmp bignum_mod_n384_writeback #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_mod_n384_6.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_n384_6.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_mod_n384_6.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_n384_6.S index 6b68c2a4445..2daf1fce8c3 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_mod_n384_6.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_n384_6.S @@ -40,6 +40,7 @@ S2N_BN_SYMBOL(bignum_mod_n384_6): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_mod_n384_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_n384_alt.S similarity index 90% rename from third_party/s2n-bignum/x86_att/p384/bignum_mod_n384_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_n384_alt.S index 92282a83a7e..f25bea3f779 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_mod_n384_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_n384_alt.S @@ -44,6 +44,7 @@ #define qshort %ebp S2N_BN_SYMBOL(bignum_mod_n384_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi @@ -64,7 +65,7 @@ S2N_BN_SYMBOL(bignum_mod_n384_alt): // If the input is already <= 5 words long, go to a trivial "copy" path cmpq $6, k - jc shortinput + jc bignum_mod_n384_alt_shortinput // Otherwise load the top 6 digits (top-down) and reduce k by 6 @@ -107,9 +108,9 @@ S2N_BN_SYMBOL(bignum_mod_n384_alt): // Now do (k-6) iterations of 7->6 word modular reduction testq k, k - jz writeback + jz bignum_mod_n384_alt_writeback -loop: +bignum_mod_n384_alt_loop: // Compute q = min (m5 + 1) (2^64 - 1) @@ -173,11 +174,11 @@ loop: movq d, m0 decq k - jnz loop + jnz bignum_mod_n384_alt_loop // Write back -writeback: +bignum_mod_n384_alt_writeback: movq m0, (z) movq m1, 8(z) @@ -199,7 +200,7 @@ writeback: #endif ret -shortinput: +bignum_mod_n384_alt_shortinput: xorq m0, m0 xorq m1, m1 @@ -209,21 +210,21 @@ shortinput: xorq m5, m5 testq k, k - jz writeback + jz bignum_mod_n384_alt_writeback movq (%rdx), m0 decq k - jz writeback + jz bignum_mod_n384_alt_writeback movq 8(%rdx), m1 decq k - jz writeback + jz bignum_mod_n384_alt_writeback movq 16(%rdx), m2 decq k - jz writeback + jz bignum_mod_n384_alt_writeback movq 24(%rdx), m3 decq k - jz writeback + jz bignum_mod_n384_alt_writeback movq 32(%rdx), m4 - jmp writeback + jmp bignum_mod_n384_alt_writeback #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_mod_p384.S 
b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_p384.S similarity index 91% rename from third_party/s2n-bignum/x86_att/p384/bignum_mod_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_p384.S index c9caf41c83d..69418ecd5bf 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_mod_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_p384.S @@ -42,6 +42,7 @@ S2N_BN_SYMBOL(bignum_mod_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi @@ -61,7 +62,7 @@ S2N_BN_SYMBOL(bignum_mod_p384): // If the input is already <= 5 words long, go to a trivial "copy" path cmpq $6, k - jc shortinput + jc bignum_mod_p384_shortinput // Otherwise load the top 6 digits (top-down) and reduce k by 6 @@ -104,9 +105,9 @@ S2N_BN_SYMBOL(bignum_mod_p384): // Now do (k-6) iterations of 7->6 word modular reduction testq k, k - jz writeback + jz bignum_mod_p384_writeback -loop: +bignum_mod_p384_loop: // Compute q = min (m5 + 1) (2^64 - 1) @@ -169,11 +170,11 @@ loop: movq d, m0 decq k - jnz loop + jnz bignum_mod_p384_loop // Write back -writeback: +bignum_mod_p384_writeback: movq m0, (z) movq m1, 8(z) @@ -194,7 +195,7 @@ writeback: #endif ret -shortinput: +bignum_mod_p384_shortinput: xorq m0, m0 xorq m1, m1 @@ -204,21 +205,21 @@ shortinput: xorq m5, m5 testq k, k - jz writeback + jz bignum_mod_p384_writeback movq (%rdx), m0 decq k - jz writeback + jz bignum_mod_p384_writeback movq 8(%rdx), m1 decq k - jz writeback + jz bignum_mod_p384_writeback movq 16(%rdx), m2 decq k - jz writeback + jz bignum_mod_p384_writeback movq 24(%rdx), m3 decq k - jz writeback + jz bignum_mod_p384_writeback movq 32(%rdx), m4 - jmp writeback + jmp bignum_mod_p384_writeback #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_mod_p384_6.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_p384_6.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_mod_p384_6.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_p384_6.S index 7196a76f314..2c27d82ccc4 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_mod_p384_6.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_p384_6.S @@ -39,6 +39,7 @@ S2N_BN_SYMBOL(bignum_mod_p384_6): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_mod_p384_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_p384_alt.S similarity index 90% rename from third_party/s2n-bignum/x86_att/p384/bignum_mod_p384_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_p384_alt.S index 79da7842a62..16f54defb30 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_mod_p384_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_p384_alt.S @@ -46,6 +46,7 @@ S2N_BN_SYMBOL(bignum_mod_p384_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi @@ -65,7 +66,7 @@ S2N_BN_SYMBOL(bignum_mod_p384_alt): // If the input is already <= 5 words long, go to a trivial "copy" path cmpq $6, k - jc shortinput + jc bignum_mod_p384_alt_shortinput // Otherwise load the top 6 digits (top-down) and reduce k by 6 @@ -108,9 +109,9 @@ S2N_BN_SYMBOL(bignum_mod_p384_alt): // Now do (k-6) iterations of 7->6 word modular reduction testq k, k - jz writeback + jz bignum_mod_p384_alt_writeback -loop: +bignum_mod_p384_alt_loop: // Compute q = min (m5 + 1) (2^64 - 1) @@ -173,11 +174,11 @@ loop: movq d, m0 decq k - 
jnz loop + jnz bignum_mod_p384_alt_loop // Write back -writeback: +bignum_mod_p384_alt_writeback: movq m0, (z) movq m1, 8(z) @@ -198,7 +199,7 @@ writeback: #endif ret -shortinput: +bignum_mod_p384_alt_shortinput: xorq m0, m0 xorq m1, m1 @@ -208,21 +209,21 @@ shortinput: xorq m5, m5 testq k, k - jz writeback + jz bignum_mod_p384_alt_writeback movq (%rdx), m0 decq k - jz writeback + jz bignum_mod_p384_alt_writeback movq 8(%rdx), m1 decq k - jz writeback + jz bignum_mod_p384_alt_writeback movq 16(%rdx), m2 decq k - jz writeback + jz bignum_mod_p384_alt_writeback movq 24(%rdx), m3 decq k - jz writeback + jz bignum_mod_p384_alt_writeback movq 32(%rdx), m4 - jmp writeback + jmp bignum_mod_p384_alt_writeback #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_montinv_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montinv_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_montinv_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montinv_p384.S index 81928ed59dc..b85c917793e 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_montinv_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montinv_p384.S @@ -1046,6 +1046,7 @@ leaq (%rax,%rdx), %r12 S2N_BN_SYMBOL(bignum_montinv_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_montmul_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montmul_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_montmul_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montmul_p384.S index 105efac6109..b11d91efd1c 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_montmul_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montmul_p384.S @@ -86,6 +86,7 @@ adcq $0, d7 S2N_BN_SYMBOL(bignum_montmul_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_montmul_p384_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montmul_p384_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_montmul_p384_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montmul_p384_alt.S index 5a8b4905d92..4d8b14a2a5b 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_montmul_p384_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montmul_p384_alt.S @@ -108,6 +108,7 @@ adcq $0, d7 S2N_BN_SYMBOL(bignum_montmul_p384_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_montsqr_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montsqr_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_montsqr_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montsqr_p384.S index 0d0b36013ab..b71edd98017 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_montsqr_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montsqr_p384.S @@ -83,6 +83,7 @@ sbbq $0, d6 S2N_BN_SYMBOL(bignum_montsqr_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_montsqr_p384_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montsqr_p384_alt.S similarity index 99% rename from 
third_party/s2n-bignum/x86_att/p384/bignum_montsqr_p384_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montsqr_p384_alt.S index 061ef6181d3..e00a162ae77 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_montsqr_p384_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montsqr_p384_alt.S @@ -105,6 +105,7 @@ sbbq $0, d6 S2N_BN_SYMBOL(bignum_montsqr_p384_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_mux_6.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mux_6.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_mux_6.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mux_6.S index cb4c2ca503c..e4890132381 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_mux_6.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mux_6.S @@ -31,6 +31,7 @@ S2N_BN_SYMBOL(bignum_mux_6): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_neg_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_neg_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_neg_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_neg_p384.S index 746c01286a2..edefc39c899 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_neg_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_neg_p384.S @@ -31,6 +31,7 @@ #define n0short %eax S2N_BN_SYMBOL(bignum_neg_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_nonzero_6.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_nonzero_6.S similarity index 98% rename from third_party/s2n-bignum/x86_att/p384/bignum_nonzero_6.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_nonzero_6.S index 7fdb6bab060..c55511452e1 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_nonzero_6.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_nonzero_6.S @@ -26,6 +26,7 @@ S2N_BN_SYMBOL(bignum_nonzero_6): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_optneg_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_optneg_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_optneg_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_optneg_p384.S index 0a8b247e5dc..af11ff1c1b1 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_optneg_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_optneg_p384.S @@ -34,6 +34,7 @@ #define n0short %eax S2N_BN_SYMBOL(bignum_optneg_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_sub_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_sub_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_sub_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_sub_p384.S index 5914f4ae9cf..c4f617386bf 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_sub_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_sub_p384.S @@ -40,6 +40,7 @@ S2N_BN_SYMBOL(bignum_sub_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_tomont_p384.S 
b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_tomont_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_tomont_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_tomont_p384.S index 66503a2ec40..326bf327e16 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_tomont_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_tomont_p384.S @@ -85,6 +85,7 @@ adcq $0, d7 S2N_BN_SYMBOL(bignum_tomont_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_tomont_p384_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_tomont_p384_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_tomont_p384_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_tomont_p384_alt.S index 725713d3410..3aaa2a18cdc 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_tomont_p384_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_tomont_p384_alt.S @@ -103,6 +103,7 @@ adcq $0, d7 S2N_BN_SYMBOL(bignum_tomont_p384_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_triple_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_triple_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_triple_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_triple_p384.S index 52b70f6bea4..417bf465342 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_triple_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_triple_p384.S @@ -40,6 +40,7 @@ #define qshort %edx S2N_BN_SYMBOL(bignum_triple_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_triple_p384_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_triple_p384_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_triple_p384_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_triple_p384_alt.S index bdbf7e8f6d7..a48c8848890 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_triple_p384_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_triple_p384_alt.S @@ -43,6 +43,7 @@ #define dshort %edx S2N_BN_SYMBOL(bignum_triple_p384_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/p384_montjadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjadd.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/p384_montjadd.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjadd.S index 60780822043..23e12ed5e3b 100644 --- a/third_party/s2n-bignum/x86_att/p384/p384_montjadd.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjadd.S @@ -893,6 +893,7 @@ cmovnbe 40+P2, r5 S2N_BN_SYMBOL(p384_montjadd): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/p384_montjadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjadd_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/p384_montjadd_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjadd_alt.S index e36a60f331a..d16b163d338 100644 --- a/third_party/s2n-bignum/x86_att/p384/p384_montjadd_alt.S +++ 
b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjadd_alt.S @@ -818,6 +818,7 @@ cmovnbe 40+P2, r5 S2N_BN_SYMBOL(p384_montjadd_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/p384_montjdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjdouble.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/p384_montjdouble.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjdouble.S index b51d24f9317..3be512a1338 100644 --- a/third_party/s2n-bignum/x86_att/p384/p384_montjdouble.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjdouble.S @@ -904,6 +904,7 @@ movq %r13, 0x28+P0 S2N_BN_SYMBOL(p384_montjdouble): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/p384_montjdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjdouble_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/p384_montjdouble_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjdouble_alt.S index 8258e352674..5a2e397ac58 100644 --- a/third_party/s2n-bignum/x86_att/p384/p384_montjdouble_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjdouble_alt.S @@ -1098,6 +1098,7 @@ movq %r13, 0x28+P0 S2N_BN_SYMBOL(p384_montjdouble_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/p384_montjmixadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjmixadd.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/p384_montjmixadd.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjmixadd.S index 539a28117a8..284c4577190 100644 --- a/third_party/s2n-bignum/x86_att/p384/p384_montjmixadd.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjmixadd.S @@ -886,6 +886,7 @@ movq r5, 40+P S2N_BN_SYMBOL(p384_montjmixadd): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/p384_montjmixadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjmixadd_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/p384_montjmixadd_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjmixadd_alt.S index da610ee88eb..df8a7533297 100644 --- a/third_party/s2n-bignum/x86_att/p384/p384_montjmixadd_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjmixadd_alt.S @@ -811,6 +811,7 @@ movq r5, 40+P S2N_BN_SYMBOL(p384_montjmixadd_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/p384_montjscalarmul.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjscalarmul.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/p384_montjscalarmul.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjscalarmul.S index 38bea41d878..1b1445e75ec 100644 --- a/third_party/s2n-bignum/x86_att/p384/p384_montjscalarmul.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjscalarmul.S @@ -87,6 +87,7 @@ cmovzq TAB+JACSIZE*(I-1)+88(%rsp), %r9 S2N_BN_SYMBOL(p384_montjscalarmul): + _CET_ENDBR // The Windows version literally calls the standard ABI version. // This simplifies the proofs since subroutine offsets are fixed. 
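Alongside the directory move, every renamed x86_att source gains a _CET_ENDBR marker immediately after its S2N_BN_SYMBOL(...) entry point, so these functions keep a valid landing pad when Intel CET indirect-branch tracking is enforced. The macro is supplied by the imported s2n-bignum headers; the definition sketched below only shows the usual shape of such a macro and is an assumption, not a copy of that header.

/* Illustrative only: expand to the IBT landing-pad instruction when the
 * compiler signals CET support via __CET__, and to nothing otherwise, so
 * the files still assemble on toolchains and targets without CET. */
#if defined(__x86_64__) && defined(__CET__)
#define _CET_ENDBR endbr64
#else
#define _CET_ENDBR
#endif

Because the macro collapses to nothing when control-flow protection is disabled, inserting it at every entry point is a no-op for existing builds while keeping the assembly usable under IBT enforcement.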
diff --git a/third_party/s2n-bignum/x86_att/p384/p384_montjscalarmul_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjscalarmul_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/p384_montjscalarmul_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjscalarmul_alt.S index c666db6dbe9..07fd39d4b1d 100644 --- a/third_party/s2n-bignum/x86_att/p384/p384_montjscalarmul_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjscalarmul_alt.S @@ -87,6 +87,7 @@ cmovzq TAB+JACSIZE*(I-1)+88(%rsp), %r9 S2N_BN_SYMBOL(p384_montjscalarmul_alt): + _CET_ENDBR // The Windows version literally calls the standard ABI version. // This simplifies the proofs since subroutine offsets are fixed. diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_add_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_add_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_add_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_add_p521.S index b046828d458..430c382a628 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_add_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_add_p521.S @@ -40,6 +40,7 @@ S2N_BN_SYMBOL(bignum_add_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_cmul_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_cmul_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_cmul_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_cmul_p521.S index fbfc3063fd4..97695288c7d 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_cmul_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_cmul_p521.S @@ -52,6 +52,7 @@ #define h d9 S2N_BN_SYMBOL(bignum_cmul_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_cmul_p521_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_cmul_p521_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_cmul_p521_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_cmul_p521_alt.S index fd6986f232a..794193ef97e 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_cmul_p521_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_cmul_p521_alt.S @@ -56,6 +56,7 @@ #define h d9 S2N_BN_SYMBOL(bignum_cmul_p521_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_deamont_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_deamont_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_deamont_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_deamont_p521.S index 099c0e33fcf..f3ebc44c6a5 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_deamont_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_deamont_p521.S @@ -40,6 +40,7 @@ #define d8 %rbp S2N_BN_SYMBOL(bignum_deamont_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_demont_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_demont_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_demont_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_demont_p521.S index 
ef83448b156..8796752852d 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_demont_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_demont_p521.S @@ -40,6 +40,7 @@ #define d8 %rdx S2N_BN_SYMBOL(bignum_demont_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_double_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_double_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_double_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_double_p521.S index 9322ec0b1a5..d5f091669e7 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_double_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_double_p521.S @@ -28,6 +28,7 @@ S2N_BN_SYMBOL(bignum_double_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_fromlebytes_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_fromlebytes_p521.S similarity index 98% rename from third_party/s2n-bignum/x86_att/p521/bignum_fromlebytes_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_fromlebytes_p521.S index 6a80dce3c22..907de58755d 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_fromlebytes_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_fromlebytes_p521.S @@ -28,6 +28,7 @@ #define a %rax S2N_BN_SYMBOL(bignum_fromlebytes_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_half_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_half_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_half_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_half_p521.S index ee8b91a325a..c974d995679 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_half_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_half_p521.S @@ -39,6 +39,7 @@ S2N_BN_SYMBOL(bignum_half_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_inv_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_inv_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_inv_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_inv_p521.S index a23dbc56b43..51eb6edd9a4 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_inv_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_inv_p521.S @@ -966,6 +966,7 @@ leaq (%rax,%rdx), %r12 S2N_BN_SYMBOL(bignum_inv_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_mod_n521_9.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mod_n521_9.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_mod_n521_9.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mod_n521_9.S index c7e33f88fd3..9407283a86d 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_mod_n521_9.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mod_n521_9.S @@ -40,6 +40,7 @@ #define qshort %edx S2N_BN_SYMBOL(bignum_mod_n521_9): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_mod_n521_9_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mod_n521_9_alt.S similarity index 99% rename from 
third_party/s2n-bignum/x86_att/p521/bignum_mod_n521_9_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mod_n521_9_alt.S index aeb314691ab..0ecaa4595f6 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_mod_n521_9_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mod_n521_9_alt.S @@ -40,6 +40,7 @@ #define qshort %edx S2N_BN_SYMBOL(bignum_mod_n521_9_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_mod_p521_9.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mod_p521_9.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_mod_p521_9.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mod_p521_9.S index 0d67aa3ee26..7de12dd98bd 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_mod_p521_9.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mod_p521_9.S @@ -39,6 +39,7 @@ #define d7 %rsi S2N_BN_SYMBOL(bignum_mod_p521_9): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_montmul_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_montmul_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_montmul_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_montmul_p521.S index 21d777a655c..4c19463d27d 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_montmul_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_montmul_p521.S @@ -41,6 +41,7 @@ adoxq %rbx, high S2N_BN_SYMBOL(bignum_montmul_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_montmul_p521_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_montmul_p521_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_montmul_p521_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_montmul_p521_alt.S index b3d0d7c2c67..eea6ac1ce72 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_montmul_p521_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_montmul_p521_alt.S @@ -58,6 +58,7 @@ adcq %rdx, h S2N_BN_SYMBOL(bignum_montmul_p521_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_montsqr_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_montsqr_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_montsqr_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_montsqr_p521.S index ede53c627cf..d9b26f3ec92 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_montsqr_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_montsqr_p521.S @@ -52,6 +52,7 @@ adoxq zero, high S2N_BN_SYMBOL(bignum_montsqr_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_montsqr_p521_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_montsqr_p521_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_montsqr_p521_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_montsqr_p521_alt.S index dccdc33ef5d..f20a99698f7 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_montsqr_p521_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_montsqr_p521_alt.S @@ -87,6 +87,7 @@ adcq $0, c 
S2N_BN_SYMBOL(bignum_montsqr_p521_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_mul_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mul_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_mul_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mul_p521.S index f96e8417ab8..19680ee6f42 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_mul_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mul_p521.S @@ -36,6 +36,7 @@ adoxq %rbx, high S2N_BN_SYMBOL(bignum_mul_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_mul_p521_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mul_p521_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_mul_p521_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mul_p521_alt.S index a769fa0b3a8..e4488eae349 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_mul_p521_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mul_p521_alt.S @@ -53,6 +53,7 @@ adcq %rdx, h S2N_BN_SYMBOL(bignum_mul_p521_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_neg_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_neg_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_neg_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_neg_p521.S index 9a130b0b304..9128da8a0a9 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_neg_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_neg_p521.S @@ -30,6 +30,7 @@ #define d5 %r11 S2N_BN_SYMBOL(bignum_neg_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_optneg_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_optneg_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_optneg_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_optneg_p521.S index 8f4c740b6bc..95661fe0c24 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_optneg_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_optneg_p521.S @@ -32,6 +32,7 @@ #define d4 %r11 S2N_BN_SYMBOL(bignum_optneg_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_sqr_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_sqr_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_sqr_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_sqr_p521.S index 4b4748f1064..e300799b9c3 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_sqr_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_sqr_p521.S @@ -46,6 +46,7 @@ adoxq zero, high S2N_BN_SYMBOL(bignum_sqr_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_sqr_p521_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_sqr_p521_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_sqr_p521_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_sqr_p521_alt.S index 475d3d3c812..4ae40c1e5a8 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_sqr_p521_alt.S +++ 
b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_sqr_p521_alt.S @@ -81,6 +81,7 @@ adcq $0, c S2N_BN_SYMBOL(bignum_sqr_p521_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_sub_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_sub_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_sub_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_sub_p521.S index 03db019833e..c4ea3d31509 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_sub_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_sub_p521.S @@ -39,6 +39,7 @@ S2N_BN_SYMBOL(bignum_sub_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_tolebytes_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_tolebytes_p521.S similarity index 98% rename from third_party/s2n-bignum/x86_att/p521/bignum_tolebytes_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_tolebytes_p521.S index 7f891725690..077dcc51fa8 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_tolebytes_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_tolebytes_p521.S @@ -28,6 +28,7 @@ #define a %rax S2N_BN_SYMBOL(bignum_tolebytes_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_tomont_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_tomont_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_tomont_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_tomont_p521.S index 39983c24bae..c5c678a0202 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_tomont_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_tomont_p521.S @@ -39,6 +39,7 @@ #define d7 %rsi S2N_BN_SYMBOL(bignum_tomont_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_triple_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_triple_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_triple_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_triple_p521.S index 264481ef181..aad7a2da098 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_triple_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_triple_p521.S @@ -40,6 +40,7 @@ S2N_BN_SYMBOL(bignum_triple_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_triple_p521_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_triple_p521_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_triple_p521_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_triple_p521_alt.S index ecd07987788..a12c9b1c5c8 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_triple_p521_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_triple_p521_alt.S @@ -41,6 +41,7 @@ #define d %rdx S2N_BN_SYMBOL(bignum_triple_p521_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/p521_jadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jadd.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/p521_jadd.S rename to 
third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jadd.S index 9f1b03c47bd..36a856647ce 100644 --- a/third_party/s2n-bignum/x86_att/p521/p521_jadd.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jadd.S @@ -747,6 +747,7 @@ movq %rax, 64+P0 S2N_BN_SYMBOL(p521_jadd): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/p521_jadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jadd_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/p521_jadd_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jadd_alt.S index 5b51a4f6a62..75571f21fbf 100644 --- a/third_party/s2n-bignum/x86_att/p521/p521_jadd_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jadd_alt.S @@ -1023,6 +1023,7 @@ movq %rax, 64+P0 S2N_BN_SYMBOL(p521_jadd_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/p521_jdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jdouble.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/p521_jdouble.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jdouble.S index 22ccbebd433..e0e40bfcc35 100644 --- a/third_party/s2n-bignum/x86_att/p521/p521_jdouble.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jdouble.S @@ -1286,6 +1286,7 @@ movq %rbx, 64+P0 S2N_BN_SYMBOL(p521_jdouble): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/p521_jdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jdouble_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/p521_jdouble_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jdouble_alt.S index 2dc6c321201..a420a2ce0ab 100644 --- a/third_party/s2n-bignum/x86_att/p521/p521_jdouble_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jdouble_alt.S @@ -1775,6 +1775,7 @@ movq %rbx, 64+P0 S2N_BN_SYMBOL(p521_jdouble_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/p521_jmixadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jmixadd.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/p521_jmixadd.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jmixadd.S index 879fce6954f..fd9111224b9 100644 --- a/third_party/s2n-bignum/x86_att/p521/p521_jmixadd.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jmixadd.S @@ -776,6 +776,7 @@ movq %rax, 64+P0 S2N_BN_SYMBOL(p521_jmixadd): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/p521_jmixadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jmixadd_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/p521_jmixadd_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jmixadd_alt.S index d9279fe3054..9da04d0588e 100644 --- a/third_party/s2n-bignum/x86_att/p521/p521_jmixadd_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jmixadd_alt.S @@ -1052,6 +1052,7 @@ movq %rax, 64+P0 S2N_BN_SYMBOL(p521_jmixadd_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/p521_jscalarmul.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jscalarmul.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/p521_jscalarmul.S rename to 
third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jscalarmul.S index 905c32a76d3..d0f73f8b423 100644 --- a/third_party/s2n-bignum/x86_att/p521/p521_jscalarmul.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jscalarmul.S @@ -73,6 +73,7 @@ cmovzq TAB+JACSIZE*(I-1)+64+C*NUMSIZE(%rsp), %r12 S2N_BN_SYMBOL(p521_jscalarmul): + _CET_ENDBR // The Windows version literally calls the standard ABI version. // This simplifies the proofs since subroutine offsets are fixed. diff --git a/third_party/s2n-bignum/x86_att/p521/p521_jscalarmul_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jscalarmul_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/p521_jscalarmul_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jscalarmul_alt.S index ee0fca779b4..cc2a9d83319 100644 --- a/third_party/s2n-bignum/x86_att/p521/p521_jscalarmul_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jscalarmul_alt.S @@ -73,6 +73,7 @@ cmovzq TAB+JACSIZE*(I-1)+64+C*NUMSIZE(%rsp), %r12 S2N_BN_SYMBOL(p521_jscalarmul_alt): + _CET_ENDBR // The Windows version literally calls the standard ABI version. // This simplifies the proofs since subroutine offsets are fixed. diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_add_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_add_p256k1.S new file mode 100644 index 00000000000..c78ad8ffb00 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_add_p256k1.S @@ -0,0 +1,101 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Add modulo p_256k1, z := (x + y) mod p_256k1, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_add_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add_p256k1) + .text + +#define z %rdi +#define x %rsi +#define y %rdx + +#define d0 %rcx +#define d1 %r8 +#define d2 %r9 +#define d3 %r10 + +#define dd %rax + +// These two re-use inputs x and y when safe to do so + +#define l %rsi +#define c %rdx + +S2N_BN_SYMBOL(bignum_add_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Load and add the two inputs as 2^256 * (-c) + [d3;d2;d1;d0] = x + y + + movq (x), d0 + addq (y), d0 + movq 8(x), d1 + adcq 8(y), d1 + movq 16(x), d2 + adcq 16(y), d2 + movq 24(x), d3 + adcq 24(y), d3 + sbbq c, c + +// Create dd = d3 AND d2 AND d1 to condense the later comparison +// We hope this will interleave with the addition, though we can't +// express that directly as the AND operation destroys the carry flag. + + movq d1, dd + andq d2, dd + andq d3, dd + +// Decide whether z >= p_256k1 <=> z + 4294968273 >= 2^256. 
+// For the lowest word use d0 + 4294968273 >= 2^64 <=> ~4294968273 < d0 + + movq $~4294968273, l + cmpq d0, l + adcq $0, dd + sbbq $0, c + +// Now c <> 0 <=> z >= p_256k1, so mask the constant l accordingly + + notq l + cmovzq c, l + +// If z >= p_256k1 do z := z - p_256k1, i.e. add l in 4 digits + + addq l, d0 + movq d0, (z) + adcq $0, d1 + movq d1, 8(z) + adcq $0, d2 + movq d2, 16(z) + adcq $0, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_cmul_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_cmul_p256k1.S new file mode 100644 index 00000000000..cab05713000 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_cmul_p256k1.S @@ -0,0 +1,107 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word modulo p_256k1, z := (c * x) mod p_256k1, assuming +// x reduced +// Inputs c, x[4]; output z[4] +// +// extern void bignum_cmul_p256k1 +// (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = c, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = c, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p256k1) + .text + +#define z %rdi + +// Temporarily moved here for initial multiply + +#define x %rcx +#define c %rcx + +// Likewise this is thrown away after initial multiply + +#define d %rdx +#define h %rdx + +#define a %rax +#define ashort %eax +#define q %rax + +#define d0 %rsi +#define d1 %r8 +#define d2 %r9 +#define d3 %r10 + +S2N_BN_SYMBOL(bignum_cmul_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Shuffle inputs (since we want multiplier in %rdx) + + movq %rdx, x + movq %rsi, d + +// Multiply, accumulating the result as 2^256 * h + [d3;d2;d1;d0] + + mulxq (x), d0, d1 + mulxq 8(x), a, d2 + addq a, d1 + mulxq 16(x), a, d3 + adcq a, d2 + mulxq 24(x), a, h + adcq a, d3 + adcq $0, h + +// Now the quotient estimate is q = h + 1, and then we do the reduction, +// writing z = [d3;d2;d1;d0], as z' = (2^256 * h + z) - q * p_256k1 = +// (2^256 * h + z) - q * (2^256 - 4294968273) = -2^256 + (z + 4294968273 * q) + + leaq 1(h), q + movq $4294968273, c + mulq c + + addq %rax, d0 + adcq %rdx, d1 + adcq $0, d2 + adcq $0, d3 + +// Because of the implicit -2^256, CF means >= 0 so z' is the answer; ~CF +// means z' < 0 so we add p_256k1, which in 4 digits means subtracting c. 
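As an aside (editor-added, not part of the generated patch), the estimate-and-correct reduction described in the comments above — quotient estimate q = h + 1, add 4294968273 * q, then add p_256k1 back if the result went negative — can be checked against plain modular arithmetic. A minimal Python sketch, assuming only p_256k1 = 2^256 - 4294968273 and a reduced input x; the helper name is illustrative, not an exported symbol:

    p = 2**256 - 2**32 - 977            # p_256k1
    R = 2**256
    assert R - p == 4294968273

    def cmul_p256k1(c, x):
        # Mirrors the reduction sketched in the comments: q = h + 1 is either
        # exact or one too large, and a single conditional correction fixes it.
        t = c * x                        # c < 2^64, x < p, so t < 2^64 * p
        h, l = divmod(t, R)              # t = 2^256*h + l
        q = h + 1
        z = l + 4294968273 * q - R       # t - q*p, including the implicit -2^256
        if z < 0:                        # estimate was one too big
            z += p
        assert z == t % p
        return z

The same model covers bignum_cmul_p256k1_alt below, which differs only in how the initial product is accumulated.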
+ + movq $0, a + cmovcq a, c + + subq c, d0 + movq d0, (z) + sbbq a, d1 + movq d1, 8(z) + sbbq a, d2 + movq d2, 16(z) + sbbq a, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_cmul_p256k1_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_cmul_p256k1_alt.S new file mode 100644 index 00000000000..3c9e95340ca --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_cmul_p256k1_alt.S @@ -0,0 +1,119 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word modulo p_256k1, z := (c * x) mod p_256k1, assuming +// x reduced +// Inputs c, x[4]; output z[4] +// +// extern void bignum_cmul_p256k1_alt +// (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = c, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = c, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p256k1_alt) + .text + +#define z %rdi + +// Temporarily moved here for initial multiply + +#define x %rcx +#define c %rsi + +// Likewise this is thrown away after initial multiply + +#define d %rdx +#define h %rdx + +#define a %rax +#define ashort %eax +#define q %rax + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %rcx + +S2N_BN_SYMBOL(bignum_cmul_p256k1_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Shuffle inputs (since we want %rdx for the high parts of products) + + movq %rdx, x + +// Multiply, accumulating the result as 2^256 * h + [d3;d2;d1;d0] + + movq (x), a + mulq c + movq a, d0 + movq d, d1 + + movq 8(x), a + xorq d2, d2 + mulq c + addq a, d1 + adcq d, d2 + + movq 16(x), a + mulq c + addq a, d2 + adcq $0, d + + movq 24(x), a + movq d, d3 + mulq c + addq a, d3 + adcq $0, h + +// Now the quotient estimate is q = h + 1, and then we do the reduction, +// writing z = [d3;d2;d1;d0], as z' = (2^256 * h + z) - q * p_256k1 = +// (2^256 * h + z) - q * (2^256 - 4294968273) = -2^256 + (z + 4294968273 * q) + + leaq 1(h), q + movq $4294968273, c + mulq c + + addq %rax, d0 + adcq %rdx, d1 + adcq $0, d2 + adcq $0, d3 + +// Because of the implicit -2^256, CF means >= 0 so z' is the answer; ~CF +// means z' < 0 so we add p_256k1, which in 4 digits means subtracting c. + + movq $0, a + cmovcq a, c + + subq c, d0 + movq d0, (z) + sbbq a, d1 + movq d1, 8(z) + sbbq a, d2 + movq d2, 16(z) + sbbq a, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_deamont_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_deamont_p256k1.S new file mode 100644 index 00000000000..46071c3b081 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_deamont_p256k1.S @@ -0,0 +1,147 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from Montgomery form z := (x / 2^256) mod p_256k1, +// Input x[4]; output z[4] +// +// extern void bignum_deamont_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Convert a 4-digit bignum x out of its (optionally almost) Montgomery form, +// "almost" meaning any 4-digit input will work, with no range restriction. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_deamont_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_deamont_p256k1) + .text + +#define z %rdi +#define x %rsi + +// Re-use x variable for the negated multiplicative inverse of p_256k1 + +#define w %rsi + +// The rotating registers for the 4 digits + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +// Other variables. We need d == %rdx for mulx instructions + +#define a %rax +#define d %rdx +#define c %rcx + +S2N_BN_SYMBOL(bignum_deamont_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Set up an initial 4-word window [d3,d2,d1,d0] = x + + movq (x), d0 + movq 8(x), d1 + movq 16(x), d2 + movq 24(x), d3 + +// Set w to negated multiplicative inverse p_256k1 * w == -1 (mod 2^64). + + movq $0xd838091dd2253531, w + +// Four stages of Montgomery reduction, rotating the register window. +// Use c as a carry-catcher since the imul destroys the flags in general. + + imulq w, d0 + movq $4294968273, a + mulq d0 + subq d, d1 + sbbq c, c + + imulq w, d1 + movq $4294968273, a + mulq d1 + negq c + sbbq d, d2 + sbbq c, c + + imulq w, d2 + movq $4294968273, a + mulq d2 + negq c + sbbq d, d3 + sbbq c, c + + imulq w, d3 + movq $4294968273, a + mulq d3 + +// Take an AND of the four cofactor digits, re-using the w variable. +// We hope this will interleave nicely with the computation sequence +// above but don't want to use other registers explicitly, so put +// it all together in a block. + + movq d0, w + andq d1, w + andq d2, w + andq d3, w + +// Finish propagating carry through new top part + + xorq a, a + negq c + sbbq d, d0 + sbbq a, d1 + sbbq a, d2 + sbbq a, d3 + +// The result thus far is z = (x + q * p_256k1) / 2^256. Note that +// z < p_256k1 <=> x < (2^256 - q) * p_256k1, and since +// x < 2^256 < 2 * p_256k1, we have that *if* q < 2^256 - 1 then +// z < p_256k1. Conversely if q = 2^256 - 1 then since +// x + q * p_256k1 == 0 (mod 2^256) we have x == p_256k1 (mod 2^256) +// and thus x = p_256k1, and z >= p_256k1 (in fact z = p_256k1). +// So in summary z < p_256k1 <=> ~(q = 2^256 - 1) <=> ~(x = p_256k1). +// and hence iff q is all 1s, or equivalently dd is all 1s, we +// correct by subtracting p_256k1 to get 0. Since this is only one +// case we compute the result more explicitly rather than doing +// arithmetic with carry propagation. 
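As an aside (editor-added, not part of the patch), the four Montgomery steps above — multiply the low word by the negated inverse 0xd838091dd2253531, fold in a multiple of p_256k1 so the bottom word cancels, then apply the single correction argued in the preceding comment — compute (x / 2^256) mod p_256k1. A hedged Python reference using the standard word-by-word REDC formulation rather than the register-level subtraction sequence used here; the helper name is illustrative only:

    p = 2**256 - 2**32 - 977             # p_256k1
    w = 0xd838091dd2253531               # negated word inverse: p * w == -1 (mod 2^64)
    assert (p * w) % 2**64 == 2**64 - 1

    def deamont_p256k1(x):
        # Word-by-word Montgomery reduction of a 4-word input: (x / 2^256) mod p.
        z = x
        for _ in range(4):
            m = (z * w) % 2**64          # multiplier that clears the low 64 bits
            z = (z + m * p) >> 64        # exact division by 2^64
        if z >= p:                       # at most one correction, as argued above
            z -= p
        assert z == (x * pow(2**256, -1, p)) % p
        return z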
+ + movq $4294968273, d + addq d0, d + addq $1, w + cmovzq d, d0 + cmovzq a, d1 + cmovzq a, d2 + cmovzq a, d3 + +// write back and return + + movq d0, (z) + movq d1, 8(z) + movq d2, 16(z) + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_demont_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_demont_p256k1.S new file mode 100644 index 00000000000..2edc6be8d66 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_demont_p256k1.S @@ -0,0 +1,114 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from Montgomery form z := (x / 2^256) mod p_256k1, +// assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_demont_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// This assumes the input is < p_256k1 for correctness. If this is not the +// case, use the variant "bignum_deamont_p256k1" instead. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_demont_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_demont_p256k1) + .text + +#define z %rdi +#define x %rsi + +// Re-use x variable for the negated multiplicative inverse of p_256k1 + +#define w %rsi + +// The rotating registers for the 4 digits + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +// Other variables. We need d == %rdx for mulx instructions + +#define a %rax +#define d %rdx +#define c %rcx + +S2N_BN_SYMBOL(bignum_demont_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Set up an initial 4-word window [d3,d2,d1,d0] = x + + movq (x), d0 + movq 8(x), d1 + movq 16(x), d2 + movq 24(x), d3 + +// Set w to negated multiplicative inverse p_256k1 * w == -1 (mod 2^64). + + movq $0xd838091dd2253531, w + +// Four stages of Montgomery reduction, rotating the register window. +// Use c as a carry-catcher since the imul destroys the flags in general. + + imulq w, d0 + movq $4294968273, a + mulq d0 + subq d, d1 + sbbq c, c + + imulq w, d1 + movq $4294968273, a + mulq d1 + negq c + sbbq d, d2 + sbbq c, c + + imulq w, d2 + movq $4294968273, a + mulq d2 + negq c + sbbq d, d3 + sbbq c, c + + imulq w, d3 + movq $4294968273, a + mulq d3 + negq c + sbbq d, d0 + +// Finish propagating carry through new top part, write back and return + + movq d0, (z) + sbbq $0, d1 + movq d1, 8(z) + sbbq $0, d2 + movq d2, 16(z) + sbbq $0, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_double_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_double_p256k1.S new file mode 100644 index 00000000000..fa34aeff914 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_double_p256k1.S @@ -0,0 +1,96 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Double modulo p_256k1, z := (2 * x) mod p_256k1, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_double_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_double_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_double_p256k1) + .text + +#define z %rdi +#define x %rsi + +#define d0 %rcx +#define d1 %r8 +#define d2 %r9 +#define d3 %r10 + +#define dd %rax +#define c %rdx + +// Re-uses the input x when safe to do so + +#define l %rsi + +S2N_BN_SYMBOL(bignum_double_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load the inputs and double top-down as z = 2^256 * c + [d3;d2;d1;d0] +// While doing this, create an AND dd of [d3;d2;d1] to condense comparison + + movq 24(x), d3 + movq d3, c + movq 16(x), d2 + shrq $63, c + shldq $1, d2, d3 + movq d3, dd + movq 8(x), d1 + shldq $1, d1, d2 + andq d2, dd + movq (x), d0 + shldq $1, d0, d1 + andq d1, dd + shlq $1, d0 + +// Decide whether z >= p_256k1 <=> z + 4294968273 >= 2^256. +// For the lowest word use d0 + 4294968273 >= 2^64 <=> ~4294968273 < d0 + + movq $~4294968273, l + cmpq d0, l + adcq $0, dd + adcq $0, c + +// Now c <> 0 <=> z >= p_256k1, so mask the constant l accordingly + + notq l + cmovzq c, l + +// If z >= p_256k1 do z := z - p_256k1, i.e. add l in 4 digits + + addq l, d0 + movq d0, (z) + adcq $0, d1 + movq d1, 8(z) + adcq $0, d2 + movq d2, 16(z) + adcq $0, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_half_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_half_p256k1.S new file mode 100644 index 00000000000..da8317a9e42 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_half_p256k1.S @@ -0,0 +1,86 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Halve modulo p_256k1, z := (x / 2) mod p_256k1, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_half_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_half_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_half_p256k1) + .text + +#define z %rdi +#define x %rsi + +#define d0 %rcx +#define d1 %rdx +#define d2 %r8 +#define d3 %r9 + +#define c %rax + +S2N_BN_SYMBOL(bignum_half_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load the 4 digits of x, and letting b be the LSB (whether it's odd) +// construct the constant c = 4294968273 * b + + movq (x), d0 + movq $4294968273, c + movq 8(x), d1 + movq $1, d3 + andq d0, d3 + movq 16(x), d2 + cmovzq d3, c + movq 24(x), d3 + +// We want (x + b * p_256k1) / 2 where b is that LSB, in {0,1}. +// That amounts to (2^256 * b + x - 4294968273 * b) / 2, and +// modulo 4 words that's the same as ([2^256 * c + x] - c) / 2. +// So do that subtraction and shift a place right as we go. + + subq c, d0 + sbbq $0, d1 + sbbq $0, d2 + sbbq $0, d3 + sbbq $0, c + +// Shift right, pushing the carry back down, and store back + + shrdq $1, d1, d0 + movq d0, (z) + shrdq $1, d2, d1 + movq d1, 8(z) + shrdq $1, d3, d2 + movq d2, 16(z) + shrdq $1, c, d3 + movq d3, 24(z) + +// Return + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_mod_n256k1_4.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_mod_n256k1_4.S new file mode 100644 index 00000000000..79fb4d9adc3 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_mod_n256k1_4.S @@ -0,0 +1,98 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod n_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_mod_n256k1_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Reduction is modulo the group order of the secp256k1 curve. 
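As an aside (editor-added, not part of the patch), the three constant words loaded below are the little-endian word encoding of 2^256 - n_256k1, and the reduction is a single add-then-conditionally-undo step. A small Python check of both facts, with the secp256k1 group order written out explicitly; the helper name is illustrative only:

    n = 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEBAAEDCE6AF48A03BBFD25E8CD0364141
    delta = 2**256 - n
    # words [n2; n1; n0] used by the code below
    assert delta == (1 << 128) | (0x4551231950b75fc4 << 64) | 0x402da1732fc9bebf

    def mod_n256k1_4(x):
        # Add 2^256 - n; a carry out of the top word means x >= n, in which case
        # the wrapped sum is already x - n; otherwise undo the addition.
        s = x + delta
        return s - 2**256 if s >= 2**256 else x

    assert mod_n256k1_4(n - 1) == n - 1 and mod_n256k1_4(n) == 0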
+// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n256k1_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n256k1_4) + .text + +#define z %rdi +#define x %rsi + +#define d0 %rdx +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n0 %rax +#define n1 %r10 +#define n2 %r11 + +#define n2short %r11d + +// Can re-use this as a temporary once we've loaded the input + +#define c %rsi + +S2N_BN_SYMBOL(bignum_mod_n256k1_4): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load a set of registers [0; n2; n1; n0] = 2^256 - n_256k1 + + movq $0x402da1732fc9bebf, n0 + movq $0x4551231950b75fc4, n1 + movl $1, n2short + +// Load the input and compute x + (2^256 - n_256k1) + + movq (x), d0 + addq n0, d0 + movq 8(x), d1 + adcq n1, d1 + movq 16(x), d2 + adcq n2, d2 + movq 24(x), d3 + adcq $0, d3 + +// Now CF is set iff 2^256 <= x + (2^256 - n_256k1), i.e. iff n_256k1 <= x. +// Create a mask for the condition x < n, and mask the three nontrivial digits +// ready to undo the previous addition with a compensating subtraction + + sbbq c, c + notq c + andq c, n0 + andq c, n1 + andq c, n2 + +// Now subtract mask * (2^256 - n_256k1) again and store + + subq n0, d0 + movq d0, (z) + sbbq n1, d1 + movq d1, 8(z) + sbbq n2, d2 + movq d2, 16(z) + sbbq $0, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_mod_p256k1_4.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_mod_p256k1_4.S new file mode 100644 index 00000000000..b6519abf266 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_mod_p256k1_4.S @@ -0,0 +1,83 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo field characteristic, z := x mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_mod_p256k1_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_p256k1_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_p256k1_4) + .text + +#define z %rdi +#define x %rsi + +#define d0 %rdx +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 +#define c %r10 + +#define d %rax + + +S2N_BN_SYMBOL(bignum_mod_p256k1_4): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load the inputs as [d3;d2;d1;d0] and let d be an AND of [d3;d2;d1] to +// condense the comparison below. + + movq (x), d0 + movq 8(x), d1 + movq d1, d + movq 16(x), d2 + andq d2, d + movq 24(x), d3 + andq d3, d + +// Compare x >= p_256k1 = 2^256 - 4294968273 using condensed carry: +// we get a carry from the lowest digit and all other digits are 1. +// We end up with c and d as adjusted digits for x - p_256k1 if so. 
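As an aside (editor-added, not part of the patch), the condensed-carry comparison described just above is worth spelling out: since p_256k1 = 2^256 - 4294968273, x >= p_256k1 exactly when x + 4294968273 overflows 2^256, and that overflow can only happen when the low-word addition carries and the three upper words are all ones. A short Python sketch of the same test, with boundary spot checks; the helper name is illustrative only:

    p = 2**256 - 2**32 - 977                  # p_256k1 = 2^256 - 4294968273

    def mod_p256k1_4(x):
        d = [(x >> (64 * i)) & (2**64 - 1) for i in range(4)]
        low_carry = d[0] + 4294968273 >= 2**64
        top_all_ones = (d[1] & d[2] & d[3]) == 2**64 - 1
        assert (x >= p) == (low_carry and top_all_ones)
        return x - p if x >= p else x

    for x in (0, p - 1, p, p + 1, 2**256 - 1):
        assert mod_p256k1_4(x) == x % p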
+ + movq $4294968273, c + addq d0, c + adcq $0, d + +// If indeed x >= p_256k1 then x := x - p_256k1, using c and d +// Either way, write back to z + + cmovcq c, d0 + movq d0, (z) + cmovcq d, d1 + movq d1, 8(z) + cmovcq d, d2 + movq d2, 16(z) + cmovcq d, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_montmul_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_montmul_p256k1.S new file mode 100644 index 00000000000..5d0b06b394f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_montmul_p256k1.S @@ -0,0 +1,235 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^256) mod p_256k1 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_montmul_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Does z := (2^{-256} * x * y) mod p_256k1, assuming that the inputs x and y +// satisfy x * y <= 2^256 * p_256k2 (in particular this is true if we are in +// the "usual" case x < p_256k1 and y < p_256k1). +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p256k1) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// Copied in or set up + +#define y %rcx + +// A zero register + +#define zero %rbp +#define zeroe %ebp + +// Also used for multiplicative inverse in second part + +#define w %rbp + +// mulpadd(high,low,m) adds %rdx * m to a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rbx as temporaries. + +#define mulpadd(high,low,m) \ + mulxq m, %rax, %rbx ; \ + adcxq %rax, low ; \ + adoxq %rbx, high + +// mulpade(high,low,i) adds %rdx * x[i] to a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax as a temporary, assuming high created from scratch +// and that zero has value zero. 
+ +#define mulpade(high,low,m) \ + mulxq m, %rax, high ; \ + adcxq %rax, low ; \ + adoxq zero, high + +S2N_BN_SYMBOL(bignum_montmul_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Copy y into a safe register to start with + + movq %rdx, y + +// Zero a register, which also makes sure we don't get a fake carry-in + + xorl zeroe, zeroe + +// Do the zeroth row, which is a bit different + + movq (y), %rdx + + mulxq (x), %r8, %r9 + mulxq 8(x), %rax, %r10 + addq %rax, %r9 + mulxq 16(x), %rax, %r11 + adcq %rax, %r10 + mulxq 24(x), %rax, %r12 + adcq %rax, %r11 + adcq zero, %r12 + +// Add row 1 + + xorl zeroe, zeroe + movq 8(y), %rdx + mulpadd(%r10,%r9,(x)) + mulpadd(%r11,%r10,8(x)) + mulpadd(%r12,%r11,16(x)) + mulpade(%r13,%r12,24(x)) + adcxq zero, %r13 + +// Add row 2 + + xorl zeroe, zeroe + movq 16(y), %rdx + mulpadd(%r11,%r10,(x)) + mulpadd(%r12,%r11,8(x)) + mulpadd(%r13,%r12,16(x)) + mulpade(%r14,%r13,24(x)); + adcxq zero, %r14 + +// Add row 3 + + xorl zeroe, zeroe + movq 24(y), %rdx + mulpadd(%r12,%r11,(x)) + mulpadd(%r13,%r12,8(x)) + mulpadd(%r14,%r13,16(x)); + mulpade(%r15,%r14,24(x)); + adcxq zero, %r15 + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [%r15,%r14,%r13,%r12] and l = [%r11,%r10,%r9,%r8] +// Do Montgomery reductions, now using %rcx as a carry-saver. +// A direct carry chain is possible using mulx exclusively, but it +// requires more moves and overall seems to have lower performance. + + movq $0xd838091dd2253531, w + movq $4294968273, %rbx + +// Montgomery reduce row 0 + + movq %rbx, %rax + imulq w, %r8 + mulq %r8 + subq %rdx, %r9 + sbbq %rcx, %rcx + +// Montgomery reduce row 1 + + movq %rbx, %rax + imulq w, %r9 + mulq %r9 + negq %rcx + sbbq %rdx, %r10 + sbbq %rcx, %rcx + +// Montgomery reduce row 2 + + movq %rbx, %rax + imulq w, %r10 + mulq %r10 + negq %rcx + sbbq %rdx, %r11 + sbbq %rcx, %rcx + +// Montgomery reduce row 3 + + movq %rbx, %rax + imulq w, %r11 + mulq %r11 + negq %rcx + +// Now [%r15,%r14,%r13,%r12] := [%r15,%r14,%r13,%r12] + [%r11,%r10,%r9,%r8] - (%rdx + CF) + + sbbq %rdx, %r8 + sbbq $0, %r9 + sbbq $0, %r10 + sbbq $0, %r11 + + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + sbbq w, w + +// Let b be the top carry captured just above as w = (2^64-1) * b +// Now if [b,%r15,%r14,%r13,%r12] >= p_256k1, subtract p_256k1, i.e. add 4294968273 +// and either way throw away the top word. [b,%r15,%r14,%r13,%r12] - p_256k1 = +// [(b - 1),%r15,%r14,%r13,%r12] + 4294968273. 
If [%r15,%r14,%r13,%r12] + 4294968273 +// gives carry flag CF then >= comparison is top = 0 <=> b - 1 + CF = 0 which +// is equivalent to b \/ CF, and so to (2^64-1) * b + (2^64 - 1) + CF >= 2^64 + + movq %r12, %r8 + addq %rbx, %r8 + movq %r13, %r9 + adcq $0, %r9 + movq %r14, %r10 + adcq $0, %r10 + movq %r15, %r11 + adcq $0, %r11 + + adcq $-1, w + +// Write everything back + + cmovcq %r8, %r12 + movq %r12, (z) + cmovcq %r9, %r13 + movq %r13, 8(z) + cmovcq %r10, %r14 + movq %r14, 16(z) + cmovcq %r11, %r15 + movq %r15, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_montmul_p256k1_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_montmul_p256k1_alt.S new file mode 100644 index 00000000000..81c7f805f2c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_montmul_p256k1_alt.S @@ -0,0 +1,235 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^256) mod p_256k1 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_montmul_p256k1_alt +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Does z := (2^{-256} * x * y) mod p_256k1, assuming that the inputs x and y +// satisfy x * y <= 2^256 * p_256k2 (in particular this is true if we are in +// the "usual" case x < p_256k1 and y < p_256k1). 
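As an aside (editor-added, not part of the patch), bignum_montmul_p256k1 and this _alt variant compute the same function, (x * y / 2^256) mod p_256k1; only the multiplication strategy differs (mulx/adcx/adox chains versus classic mulq schoolbook accumulation), while the Montgomery reduction and final correction are shared. A minimal Python statement of that contract, with a Montgomery-form round trip as a sanity check; names here are illustrative Python, not the assembly symbols:

    p = 2**256 - 2**32 - 977                     # p_256k1
    R_inv = pow(2**256, -1, p)

    def montmul_p256k1(x, y):
        # Functional contract shared by the mulx and _alt code paths.
        return (x * y * R_inv) % p

    x = 0x123456789ABCDEF                        # arbitrary test value
    x_mont = (x * 2**256) % p                    # to Montgomery form
    assert montmul_p256k1(x_mont, 1) == x % p    # multiplying by 1 converts back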
+// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p256k1_alt) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// Copied in or set up + +#define y %rcx + +// Re-used for constants in second part + +#define w %rsi + +// Macro for the key "multiply and add to (c,h,l)" step + +#define combadd(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// A minutely shorter form for when c = 0 initially + +#define combadz(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq c, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h + +S2N_BN_SYMBOL(bignum_montmul_p256k1_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Copy y into a safe register to start with + + movq %rdx, y + +// Start the window as [%r10;%r9;%r8] with 00 product + + movq (x), %rax + mulq (y) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + +// Column 1 + + xorq %r11, %r11 + combads(%r10,%r9,(x),8(y)) + combadz(%r11,%r10,%r9,8(x),(y)) + +// Column 2 + + xorq %r12, %r12 + combadz(%r12,%r11,%r10,(x),16(y)) + combadd(%r12,%r11,%r10,8(x),8(y)) + combadd(%r12,%r11,%r10,16(x),(y)) + +// Column 3 + + xorq %r13, %r13 + combadz(%r13,%r12,%r11,(x),24(y)) + combadd(%r13,%r12,%r11,8(x),16(y)) + combadd(%r13,%r12,%r11,16(x),8(y)) + combadd(%r13,%r12,%r11,24(x),(y)) + +// Column 4 + + xorq %r14, %r14 + combadz(%r14,%r13,%r12,8(x),24(y)) + combadd(%r14,%r13,%r12,16(x),16(y)) + combadd(%r14,%r13,%r12,24(x),8(y)) + +// Column 5 + + xorq %r15, %r15 + combadz(%r15,%r14,%r13,16(x),24(y)) + combadd(%r15,%r14,%r13,24(x),16(y)) + +// Final work for columns 6 and 7 + + movq 24(x), %rax + mulq 24(y) + addq %rax, %r14 + adcq %rdx, %r15 + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [%r15,%r14,%r13,%r12] and l = [%r11,%r10,%r9,%r8] +// Do Montgomery reductions, now using %rcx as a carry-saver. + + movq $0xd838091dd2253531, w + movq $4294968273, %rbx + +// Montgomery reduce row 0 + + movq %rbx, %rax + imulq w, %r8 + mulq %r8 + subq %rdx, %r9 + sbbq %rcx, %rcx + +// Montgomery reduce row 1 + + movq %rbx, %rax + imulq w, %r9 + mulq %r9 + negq %rcx + sbbq %rdx, %r10 + sbbq %rcx, %rcx + +// Montgomery reduce row 2 + + movq %rbx, %rax + imulq w, %r10 + mulq %r10 + negq %rcx + sbbq %rdx, %r11 + sbbq %rcx, %rcx + +// Montgomery reduce row 3 + + movq %rbx, %rax + imulq w, %r11 + mulq %r11 + negq %rcx + +// Now [%r15,%r14,%r13,%r12] := [%r15,%r14,%r13,%r12] + [%r11,%r10,%r9,%r8] - (%rdx + CF) + + sbbq %rdx, %r8 + sbbq $0, %r9 + sbbq $0, %r10 + sbbq $0, %r11 + + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + sbbq w, w + +// Let b be the top carry captured just above as w = (2^64-1) * b +// Now if [b,%r15,%r14,%r13,%r12] >= p_256k1, subtract p_256k1, i.e. add 4294968273 +// and either way throw away the top word. [b,%r15,%r14,%r13,%r12] - p_256k1 = +// [(b - 1),%r15,%r14,%r13,%r12] + 4294968273. 
If [%r15,%r14,%r13,%r12] + 4294968273 +// gives carry flag CF then >= comparison is top = 0 <=> b - 1 + CF = 0 which +// is equivalent to b \/ CF, and so to (2^64-1) * b + (2^64 - 1) + CF >= 2^64 + + movq %r12, %r8 + addq %rbx, %r8 + movq %r13, %r9 + adcq $0, %r9 + movq %r14, %r10 + adcq $0, %r10 + movq %r15, %r11 + adcq $0, %r11 + + adcq $-1, w + +// Write everything back + + cmovcq %r8, %r12 + movq %r12, (z) + cmovcq %r9, %r13 + movq %r13, 8(z) + cmovcq %r10, %r14 + movq %r14, 16(z) + cmovcq %r11, %r15 + movq %r15, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_montsqr_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_montsqr_p256k1.S new file mode 100644 index 00000000000..f1c8a62730f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_montsqr_p256k1.S @@ -0,0 +1,213 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^256) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_montsqr_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Does z := (x^2 / 2^256) mod p_256k1, assuming x^2 <= 2^256 * p_256k1, which +// is guaranteed in particular if x < p_256k1 initially (the "intended" case). +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p256k1) + .text + +#define z %rdi +#define x %rsi + +// Use this fairly consistently for a zero + +#define zero %rbp +#define zeroe %ebp + +// Also use the same register for multiplicative inverse in Montgomery stage + +#define w %rbp + +// Add %rdx * m into a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rbx as temporaries + +#define mulpadd(high,low,m) \ + mulxq m, %rax, %rbx ; \ + adcxq %rax, low ; \ + adoxq %rbx, high + +S2N_BN_SYMBOL(bignum_montsqr_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Compute [%r15;%r8] = [00] which we use later, but mainly +// set up an initial window [%r14;...;%r9] = [23;03;01] + + movq (x), %rdx + mulxq %rdx, %r8, %r15 + mulxq 8(x), %r9, %r10 + mulxq 24(x), %r11, %r12 + movq 16(x), %rdx + mulxq 24(x), %r13, %r14 + +// Clear our zero register, and also initialize the flags for the carry chain + + xorl zeroe, zeroe + +// Chain in the addition of 02 + 12 + 13 to that window (no carry-out possible) +// This gives all the "heterogeneous" terms of the squaring ready to double + + mulpadd(%r11,%r10,(x)) + mulpadd(%r12,%r11,8(x)) + movq 24(x), %rdx + mulpadd(%r13,%r12,8(x)) + adcxq zero, %r13 + adoxq zero, %r14 + adcq zero, %r14 + +// Double and add to the 00 + 11 + 22 + 33 terms + + xorl zeroe, zeroe + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 8(x), %rdx + mulxq 
%rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 16(x), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 24(x), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq zero, %r15 + adoxq zero, %r15 + +// Now we have the full 8-digit square 2^256 * h + l where +// h = [%r15,%r14,%r13,%r12] and l = [%r11,%r10,%r9,%r8] +// Do Montgomery reductions, now using %rcx as a carry save + + movq $0xd838091dd2253531, w + movq $4294968273, %rbx + +// Montgomery reduce row 0 + + movq %rbx, %rax + imulq w, %r8 + mulq %r8 + subq %rdx, %r9 + sbbq %rcx, %rcx + +// Montgomery reduce row 1 + + movq %rbx, %rax + imulq w, %r9 + mulq %r9 + negq %rcx + sbbq %rdx, %r10 + sbbq %rcx, %rcx + +// Montgomery reduce row 2 + + movq %rbx, %rax + imulq w, %r10 + mulq %r10 + negq %rcx + sbbq %rdx, %r11 + sbbq %rcx, %rcx + +// Montgomery reduce row 3 + + movq %rbx, %rax + imulq w, %r11 + mulq %r11 + negq %rcx + +// Now [%r15,%r14,%r13,%r12] := [%r15,%r14,%r13,%r12] + [%r11,%r10,%r9,%r8] - (%rdx + CF) + + sbbq %rdx, %r8 + sbbq $0, %r9 + sbbq $0, %r10 + sbbq $0, %r11 + + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + sbbq w, w + +// Let b be the top carry captured just above as w = (2^64-1) * b +// Now if [b,%r15,%r14,%r13,%r12] >= p_256k1, subtract p_256k1, i.e. add 4294968273 +// and either way throw away the top word. [b,%r15,%r14,%r13,%r12] - p_256k1 = +// [(b - 1),%r15,%r14,%r13,%r12] + 4294968273. If [%r15,%r14,%r13,%r12] + 4294968273 +// gives carry flag CF then >= comparison is top = 0 <=> b - 1 + CF = 0 which +// is equivalent to b \/ CF, and so to (2^64-1) * b + (2^64 - 1) + CF >= 2^64 + + movq %r12, %r8 + addq %rbx, %r8 + movq %r13, %r9 + adcq $0, %r9 + movq %r14, %r10 + adcq $0, %r10 + movq %r15, %r11 + adcq $0, %r11 + + adcq $-1, w + +// Write everything back + + cmovcq %r8, %r12 + movq %r12, (z) + cmovcq %r9, %r13 + movq %r13, 8(z) + cmovcq %r10, %r14 + movq %r14, 16(z) + cmovcq %r11, %r15 + movq %r15, 24(z) + +// Restore saved registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_montsqr_p256k1_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_montsqr_p256k1_alt.S new file mode 100644 index 00000000000..ba64eda56cc --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_montsqr_p256k1_alt.S @@ -0,0 +1,218 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^256) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_montsqr_p256k1_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Does z := (x^2 / 2^256) mod p_256k1, assuming x^2 <= 2^256 * p_256k1, which +// is guaranteed in particular if x < p_256k1 initially (the "intended" case). 
+// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p256k1_alt) + .text + +#define z %rdi +#define x %rsi + +// Re-used for constants in second part + +#define w %rsi + +// Macro for the key "multiply and add to (c,h,l)" step, for square term + +#define combadd1(c,h,l,numa) \ + movq numa, %rax ; \ + mulq %rax; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa) \ + movq numa, %rax ; \ + mulq %rax; \ + addq %rax, l ; \ + adcq %rdx, h + +// A version doubling before adding, for non-square terms + +#define combadd2(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0, c ; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +S2N_BN_SYMBOL(bignum_montsqr_p256k1_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Result term 0 + + movq (x), %rax + mulq %rax + + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + +// Result term 1 + + xorq %r11, %r11 + combadd2(%r11,%r10,%r9,(x),8(x)) + +// Result term 2 + + xorq %r12, %r12 + combadd1(%r12,%r11,%r10,8(x)) + combadd2(%r12,%r11,%r10,(x),16(x)) + +// Result term 3 + + xorq %r13, %r13 + combadd2(%r13,%r12,%r11,(x),24(x)) + combadd2(%r13,%r12,%r11,8(x),16(x)) + +// Result term 4 + + xorq %r14, %r14 + combadd2(%r14,%r13,%r12,8(x),24(x)) + combadd1(%r14,%r13,%r12,16(x)) + +// Result term 5 + + xorq %r15, %r15 + combadd2(%r15,%r14,%r13,16(x),24(x)) + +// Result term 6 + + combads(%r15,%r14,24(x)) + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [%r15,%r14,%r13,%r12] and l = [%r11,%r10,%r9,%r8] +// Do Montgomery reductions, now using %rcx as a carry-saver. + + movq $0xd838091dd2253531, w + movq $4294968273, %rbx + +// Montgomery reduce row 0 + + movq %rbx, %rax + imulq w, %r8 + mulq %r8 + subq %rdx, %r9 + sbbq %rcx, %rcx + +// Montgomery reduce row 1 + + movq %rbx, %rax + imulq w, %r9 + mulq %r9 + negq %rcx + sbbq %rdx, %r10 + sbbq %rcx, %rcx + +// Montgomery reduce row 2 + + movq %rbx, %rax + imulq w, %r10 + mulq %r10 + negq %rcx + sbbq %rdx, %r11 + sbbq %rcx, %rcx + +// Montgomery reduce row 3 + + movq %rbx, %rax + imulq w, %r11 + mulq %r11 + negq %rcx + +// Now [%r15,%r14,%r13,%r12] := [%r15,%r14,%r13,%r12] + [%r11,%r10,%r9,%r8] - (%rdx + CF) + + sbbq %rdx, %r8 + sbbq $0, %r9 + sbbq $0, %r10 + sbbq $0, %r11 + + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + sbbq w, w + +// Let b be the top carry captured just above as w = (2^64-1) * b +// Now if [b,%r15,%r14,%r13,%r12] >= p_256k1, subtract p_256k1, i.e. add 4294968273 +// and either way throw away the top word. [b,%r15,%r14,%r13,%r12] - p_256k1 = +// [(b - 1),%r15,%r14,%r13,%r12] + 4294968273. 
If [%r15,%r14,%r13,%r12] + 4294968273 +// gives carry flag CF then >= comparison is top = 0 <=> b - 1 + CF = 0 which +// is equivalent to b \/ CF, and so to (2^64-1) * b + (2^64 - 1) + CF >= 2^64 + + movq %r12, %r8 + addq %rbx, %r8 + movq %r13, %r9 + adcq $0, %r9 + movq %r14, %r10 + adcq $0, %r10 + movq %r15, %r11 + adcq $0, %r11 + + adcq $-1, w + +// Write everything back + + cmovcq %r8, %r12 + movq %r12, (z) + cmovcq %r9, %r13 + movq %r13, 8(z) + cmovcq %r10, %r14 + movq %r14, 16(z) + cmovcq %r11, %r15 + movq %r15, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_mul_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_mul_p256k1.S new file mode 100644 index 00000000000..b520622a177 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_mul_p256k1.S @@ -0,0 +1,184 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply modulo p_256k1, z := (x * y) mod p_256k1 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_mul_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p256k1) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// Copied in or set up + +#define y %rcx + +// A zero register + +#define zero %rbp +#define zeroe %ebp + +// mulpadd(high,low,m) adds %rdx * m to a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rbx as temporaries. + +#define mulpadd(high,low,m) \ + mulxq m, %rax, %rbx ; \ + adcxq %rax, low ; \ + adoxq %rbx, high + +// mulpade(high,low,m) adds %rdx * m to a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax as a temporary, assuming high created from scratch +// and that zero has value zero. 
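+// Both macros keep two independent carry chains live at once: adcx folds the
+// low halves of the mulx products into the running digits through CF while
+// adox folds the high halves through OF, so the two chains interleave without
+// any flag spills between steps.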
+ +#define mulpade(high,low,m) \ + mulxq m, %rax, high ; \ + adcxq %rax, low ; \ + adoxq zero, high + +S2N_BN_SYMBOL(bignum_mul_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Copy y into a safe register to start with + + movq %rdx, y + +// Zero a register, which also makes sure we don't get a fake carry-in + + xorl zeroe, zeroe + +// Do the zeroth row, which is a bit different + + movq (y), %rdx + + mulxq (x), %r8, %r9 + mulxq 8(x), %rax, %r10 + addq %rax, %r9 + mulxq 16(x), %rax, %r11 + adcq %rax, %r10 + mulxq 24(x), %rax, %r12 + adcq %rax, %r11 + adcq zero, %r12 + +// Add row 1 + + xorl zeroe, zeroe + movq 8(y), %rdx + mulpadd(%r10,%r9,(x)) + mulpadd(%r11,%r10,8(x)) + mulpadd(%r12,%r11,16(x)) + mulpade(%r13,%r12,24(x)) + adcxq zero, %r13 + +// Add row 2 + + xorl zeroe, zeroe + movq 16(y), %rdx + mulpadd(%r11,%r10,(x)) + mulpadd(%r12,%r11,8(x)) + mulpadd(%r13,%r12,16(x)) + mulpade(%r14,%r13,24(x)); + adcxq zero, %r14 + +// Add row 3 + + xorl zeroe, zeroe + movq 24(y), %rdx + mulpadd(%r12,%r11,(x)) + mulpadd(%r13,%r12,8(x)) + mulpadd(%r14,%r13,16(x)); + mulpade(%r15,%r14,24(x)); + adcxq zero, %r15 + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [%r15,%r14,%r13,%r12] and l = [%r11,%r10,%r9,%r8] +// and this is == 4294968273 * h + l (mod p_256k1) + + movq $4294968273, %rdx + + xorl zeroe, zeroe + + mulpadd(%r9,%r8,%r12) + mulpadd(%r10,%r9,%r13) + mulpadd(%r11,%r10,%r14) + mulpade(%r12,%r11,%r15) + adcxq zero, %r12 + +// Now we have reduced to 5 digits, 2^256 * h + l = [%r12,%r11,%r10,%r9,%r8] +// Use q = h + 1 as the initial quotient estimate, either right or 1 too big. + + leaq 1(%r12), %rax + mulxq %rax, %rax, %rbx + addq %rax, %r8 + adcq %rbx, %r9 + adcq zero, %r10 + adcq zero, %r11 + +// Now the effective answer is 2^256 * (CF - 1) + [%r11,%r10,%r9,%r8] +// So we correct if CF = 0 by subtracting 4294968273, i.e. by +// adding p_256k1 to the "full" answer + + cmovcq zero, %rdx + subq %rdx, %r8 + sbbq zero, %r9 + sbbq zero, %r10 + sbbq zero, %r11 + +// Write everything back + + movq %r8, (z) + movq %r9, 8(z) + movq %r10, 16(z) + movq %r11, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_mul_p256k1_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_mul_p256k1_alt.S new file mode 100644 index 00000000000..f63667dcbf8 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_mul_p256k1_alt.S @@ -0,0 +1,211 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply modulo p_256k1, z := (x * y) mod p_256k1 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_mul_p256k1_alt +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p256k1_alt) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// Copied in or set up + +#define y %rcx + +// Re-use input pointers later for constant and top carry + +#define d %rsi +#define c %rcx + +// Macro for the key "multiply and add to (c,h,l)" step + +#define combadd(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// A minutely shorter form for when c = 0 initially + +#define combadz(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq c, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h + +S2N_BN_SYMBOL(bignum_mul_p256k1_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Copy y into a safe register to start with + + movq %rdx, y + +// Start the window as [%r10;%r9;%r8] with 00 product + + movq (x), %rax + mulq (y) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + +// Column 1 + + xorq %r11, %r11 + combads(%r10,%r9,(x),8(y)) + combadd(%r11,%r10,%r9,8(x),(y)) + +// Column 2 + + xorq %r12, %r12 + combadz(%r12,%r11,%r10,(x),16(y)) + combadd(%r12,%r11,%r10,8(x),8(y)) + combadd(%r12,%r11,%r10,16(x),(y)) + +// Column 3 + + xorq %r13, %r13 + combadz(%r13,%r12,%r11,(x),24(y)) + combadd(%r13,%r12,%r11,8(x),16(y)) + combadd(%r13,%r12,%r11,16(x),8(y)) + combadd(%r13,%r12,%r11,24(x),(y)) + +// Column 4 + + xorq %r14, %r14 + combadz(%r14,%r13,%r12,8(x),24(y)) + combadd(%r14,%r13,%r12,16(x),16(y)) + combadd(%r14,%r13,%r12,24(x),8(y)) + +// Column 5 + + xorq %r15, %r15 + combadz(%r15,%r14,%r13,16(x),24(y)) + combadd(%r15,%r14,%r13,24(x),16(y)) + +// Final work for columns 6 and 7 + + movq 24(x), %rax + mulq 24(y) + addq %rax, %r14 + adcq %rdx, %r15 + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [%r15,%r14,%r13,%r12] and l = [%r11,%r10,%r9,%r8] +// and this is == 4294968273 * h + l (mod p_256k1) + + movq $4294968273, d + + movq %r12, %rax + mulq d + addq %rax, %r8 + adcq %rdx, %r9 + sbbq c, c + + movq %r13, %rax + mulq d + subq c, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq c, c + + movq %r14, %rax + mulq d + subq c, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq c, c + + movq %r15, %rax + mulq d + subq c, %rdx + xorq c, c + addq %rax, %r11 + movq %rdx, %r12 + adcq c, %r12 + +// Now we have reduced to 5 digits, 2^256 * h + l = [%r12,%r11,%r10,%r9,%r8] +// Use q = h + 1 as the initial quotient estimate, either right or 1 too big. 
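+// The estimate works because p_256k1 = 2^256 - 4294968273, so
+// z - q * p_256k1 = l + (h + 1) * 4294968273 - 2^256. The addition just below
+// computes l + (h + 1) * 4294968273; its carry-out is set exactly when that
+// sum reaches 2^256, i.e. when q was the true quotient, and clear when q was
+// 1 too big and p_256k1 has to be added back.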
+ + leaq 1(%r12), %rax + mulq d + addq %rax, %r8 + adcq %rdx, %r9 + adcq c, %r10 + adcq c, %r11 + +// Now the effective answer is 2^256 * (CF - 1) + [%r11,%r10,%r9,%r8] +// So we correct if CF = 0 by subtracting 4294968273, i.e. by +// adding p_256k1 to the "full" answer + + sbbq %rax, %rax + notq %rax + andq d, %rax + subq %rax, %r8 + sbbq c, %r9 + sbbq c, %r10 + sbbq c, %r11 + +// Write everything back + + movq %r8, (z) + movq %r9, 8(z) + movq %r10, 16(z) + movq %r11, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_neg_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_neg_p256k1.S new file mode 100644 index 00000000000..f7594ffe5b7 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_neg_p256k1.S @@ -0,0 +1,84 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Negate modulo p_256k1, z := (-x) mod p_256k1, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_neg_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_neg_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_neg_p256k1) + .text + +#define z %rdi +#define x %rsi + +#define q %rdx +#define n0 %rax +#define n1 %rcx +#define n2 %r8 +#define n3 %r9 + +#define c %r10 + +#define qshort %esi + +S2N_BN_SYMBOL(bignum_neg_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load the 4 digits of x and let q be an OR of all the digits + + movq (x), n0 + movq n0, q + movq 8(x), n1 + orq n1, q + movq 16(x), n2 + orq n2, q + movq 24(x), n3 + orq n3, q + +// Turn q into a strict bitmask, and c a masked constant -4294968273 + + negq q + sbbq q, q + movq $-4294968273, c + andq q, c + +// Now just do [2^256 - 4294968273] - x where the constant is masked + + subq n0, c + movq c, (z) + movq q, c + sbbq n1, c + movq c, 8(z) + movq q, c + sbbq n2, c + movq c, 16(z) + sbbq n3, q + movq q, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_optneg_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_optneg_p256k1.S new file mode 100644 index 00000000000..657c742f8f3 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_optneg_p256k1.S @@ -0,0 +1,94 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally negate modulo p_256k1, z := (-x) mod p_256k1 (if p nonzero) or +// z := x (if p zero), assuming x reduced +// Inputs p, x[4]; output z[4] +// +// extern void bignum_optneg_p256k1 +// (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = p, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = p, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optneg_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optneg_p256k1) + .text + +#define z %rdi +#define q %rsi +#define x %rdx + +#define n0 %rax +#define n1 %rcx +#define n2 %r8 +#define n3 %r9 + +#define c %r10 + +#define qshort %esi + +S2N_BN_SYMBOL(bignum_optneg_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Load the 4 digits of x and let c be an OR of all the digits + + movq (x), n0 + movq n0, c + movq 8(x), n1 + orq n1, c + movq 16(x), n2 + orq n2, c + movq 24(x), n3 + orq n3, c + +// Turn q into a strict bitmask. Force it to zero if the input is zero, +// to avoid giving -0 = p_256k1, which is not reduced though correct modulo. + + cmovzq c, q + negq q + sbbq q, q + +// We want z := if q then (2^256 - 4294968273) - x else x +// which is: [if q then ~x else x] - [if q then 4294968272 else 0] + + xorq q, n0 + xorq q, n1 + xorq q, n2 + xorq q, n3 + + movq $4294968272, c + andq q, c + xorl qshort, qshort + + subq c, n0 + movq n0, (z) + sbbq q, n1 + movq n1, 8(z) + sbbq q, n2 + movq n2, 16(z) + sbbq q, n3 + movq n3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_sqr_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_sqr_p256k1.S new file mode 100644 index 00000000000..21959c8ec99 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_sqr_p256k1.S @@ -0,0 +1,175 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square modulo p_256k1, z := (x^2) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_sqr_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p256k1) + .text + +#define z %rdi +#define x %rsi + +// Use this fairly consistently for a zero + +#define zero %rbx +#define zeroe %ebx + +// Add %rdx * m into a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rcx as temporaries + +#define mulpadd(high,low,m) \ + mulxq m, %rax, %rcx ; \ + adcxq %rax, low ; \ + adoxq %rcx, high + +// mulpade(high,low,m) adds %rdx * m to a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax as a temporary, assuming high created from scratch +// and that zero has value zero. + +#define mulpade(high,low,m) \ + mulxq m, %rax, high ; \ + adcxq %rax, low ; \ + adoxq zero, high + +S2N_BN_SYMBOL(bignum_sqr_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Compute [%r15;%r8] = [00] which we use later, but mainly +// set up an initial window [%r14;...;%r9] = [23;03;01] + + movq (x), %rdx + mulxq %rdx, %r8, %r15 + mulxq 8(x), %r9, %r10 + mulxq 24(x), %r11, %r12 + movq 16(x), %rdx + mulxq 24(x), %r13, %r14 + +// Clear our zero register, and also initialize the flags for the carry chain + + xorl zeroe, zeroe + +// Chain in the addition of 02 + 12 + 13 to that window (no carry-out possible) +// This gives all the "heterogeneous" terms of the squaring ready to double + + mulpadd(%r11,%r10,(x)) + mulpadd(%r12,%r11,8(x)) + movq 24(x), %rdx + mulpadd(%r13,%r12,8(x)) + adcxq zero, %r13 + adoxq zero, %r14 + adcq zero, %r14 + +// Double and add to the 00 + 11 + 22 + 33 terms + + xorl zeroe, zeroe + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 8(x), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 16(x), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 24(x), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq zero, %r15 + adoxq zero, %r15 + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [%r15,%r14,%r13,%r12] and l = [%r11,%r10,%r9,%r8] +// and this is == 4294968273 * h + l (mod p_256k1) + + movq $4294968273, %rdx + + xorl zeroe, zeroe + + mulpadd(%r9,%r8,%r12) + mulpadd(%r10,%r9,%r13) + mulpadd(%r11,%r10,%r14) + mulpade(%r12,%r11,%r15) + adcxq zero, %r12 + +// Now we have reduced to 5 digits, 2^256 * h + l = [%r12,%r11,%r10,%r9,%r8] +// Use q = h + 1 as the initial quotient estimate, either right or 1 too big. + + leaq 1(%r12), %rax + mulxq %rax, %rax, %rcx + addq %rax, %r8 + adcq %rcx, %r9 + adcq zero, %r10 + adcq zero, %r11 + +// Now the effective answer is 2^256 * (CF - 1) + [%r11,%r10,%r9,%r8] +// So we correct if CF = 0 by subtracting 4294968273, i.e. 
by +// adding p_256k1 to the "full" answer + + sbbq %rax, %rax + notq %rax + andq %rdx, %rax + subq %rax, %r8 + sbbq zero, %r9 + sbbq zero, %r10 + sbbq zero, %r11 + +// Write everything back + + movq %r8, (z) + movq %r9, 8(z) + movq %r10, 16(z) + movq %r11, 24(z) + +// Restore saved registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_sqr_p256k1_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_sqr_p256k1_alt.S new file mode 100644 index 00000000000..cebcd031d80 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_sqr_p256k1_alt.S @@ -0,0 +1,195 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square modulo p_256k1, z := (x^2) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_sqr_p256k1_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p256k1_alt) + .text + +#define z %rdi +#define x %rsi + +// Re-use input pointer later for constant + +#define d %rsi +#define c %rcx + +// Macro for the key "multiply and add to (c,h,l)" step, for square term + +#define combadd1(c,h,l,numa) \ + movq numa, %rax ; \ + mulq %rax; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa) \ + movq numa, %rax ; \ + mulq %rax; \ + addq %rax, l ; \ + adcq %rdx, h + +// A version doubling before adding, for non-square terms + +#define combadd2(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0, c ; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +S2N_BN_SYMBOL(bignum_sqr_p256k1_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Result term 0 + + movq (x), %rax + mulq %rax + + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + +// Result term 1 + + xorq %r11, %r11 + combadd2(%r11,%r10,%r9,(x),8(x)) + +// Result term 2 + + xorq %r12, %r12 + combadd1(%r12,%r11,%r10,8(x)) + combadd2(%r12,%r11,%r10,(x),16(x)) + +// Result term 3 + + xorq %r13, %r13 + combadd2(%r13,%r12,%r11,(x),24(x)) + combadd2(%r13,%r12,%r11,8(x),16(x)) + +// Result term 4 + + xorq %r14, %r14 + combadd2(%r14,%r13,%r12,8(x),24(x)) + combadd1(%r14,%r13,%r12,16(x)) + +// Result term 5 + + xorq %r15, %r15 + combadd2(%r15,%r14,%r13,16(x),24(x)) + +// Result term 6 + + combads(%r15,%r14,24(x)) + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [%r15,%r14,%r13,%r12] and l = [%r11,%r10,%r9,%r8] +// and this is == 4294968273 * h + l (mod p_256k1) + + movq $4294968273, d + + movq %r12, %rax + mulq d + addq %rax, %r8 + adcq %rdx, %r9 + sbbq c, c + + movq %r13, %rax + mulq d + subq c, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq c, c + + movq 
%r14, %rax + mulq d + subq c, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq c, c + + movq %r15, %rax + mulq d + subq c, %rdx + xorq c, c + addq %rax, %r11 + movq %rdx, %r12 + adcq c, %r12 + +// Now we have reduced to 5 digits, 2^256 * h + l = [%r12,%r11,%r10,%r9,%r8] +// Use q = h + 1 as the initial quotient estimate, either right or 1 too big. + + leaq 1(%r12), %rax + mulq d + addq %rax, %r8 + adcq %rdx, %r9 + adcq c, %r10 + adcq c, %r11 + +// Now the effective answer is 2^256 * (CF - 1) + [%r11,%r10,%r9,%r8] +// So we correct if CF = 0 by subtracting 4294968273, i.e. by +// adding p_256k1 to the "full" answer + + sbbq %rax, %rax + notq %rax + andq d, %rax + subq %rax, %r8 + sbbq c, %r9 + sbbq c, %r10 + sbbq c, %r11 + +// Write everything back + + movq %r8, (z) + movq %r9, 8(z) + movq %r10, 16(z) + movq %r11, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_sub_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_sub_p256k1.S new file mode 100644 index 00000000000..fe4582592c6 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_sub_p256k1.S @@ -0,0 +1,87 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Subtract modulo p_256k1, z := (x - y) mod p_256k1 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_sub_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub_p256k1) + .text + +#define z %rdi +#define x %rsi +#define y %rdx + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +#define zero %rax +#define zeroe %eax +#define c %rcx + +S2N_BN_SYMBOL(bignum_sub_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Zero a register first + + xorl zeroe, zeroe + +// Load and subtract the two inputs as [d3;d2;d1;d0] = x - y (modulo 2^256) + + movq (x), d0 + subq (y), d0 + movq 8(x), d1 + sbbq 8(y), d1 + movq 16(x), d2 + sbbq 16(y), d2 + movq 24(x), d3 + sbbq 24(y), d3 + +// Now if x < y we want to add back p_256k1, which staying within 4 digits +// means subtracting 4294968273, since p_256k1 = 2^256 - 4294968273. +// Let c be that constant 4294968273 when x < y, zero otherwise. + + movq $4294968273, c + cmovncq zero, c + +// Now correct by adding masked p_256k1, i.e. 
subtracting c, and write back + + subq c, d0 + movq d0, (z) + sbbq zero, d1 + movq d1, 8(z) + sbbq zero, d2 + movq d2, 16(z) + sbbq zero, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_tomont_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_tomont_p256k1.S new file mode 100644 index 00000000000..92f97e05677 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_tomont_p256k1.S @@ -0,0 +1,105 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert to Montgomery form z := (2^256 * x) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_tomont_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tomont_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tomont_p256k1) + .text + +#define z %rdi +#define x %rsi + +#define d %rdx + +#define a %rax +#define ashort %eax +#define q %rax + +#define d0 %rcx +#define d1 %r8 +#define d2 %r9 +#define d3 %r10 + +// Re-use the x argument later on when it's no longer needed + +#define h %rsi +#define c %rsi + +S2N_BN_SYMBOL(bignum_tomont_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Since 2^256 == 4294968273 (mod p_256k1) we more or less just set +// m = 4294968273 then devolve to a variant of bignum_cmul_p256k1; +// the logic that q = h + 1 < 2^64 and hence doesn't wrap still holds +// since the multiplier 4294968273 is known to be much less than 2^64. +// We keep this constant in %rdx throughout as it's used repeatedly. + + movq $4294968273, d + +// Multiply, accumulating the result as 2^256 * h + [d3;d2;d1;d0] +// But immediately add 1 to h to get q = h + 1 as the quotient estimate. + + mulxq (x), d0, d1 + mulxq 8(x), a, d2 + addq a, d1 + mulxq 16(x), a, d3 + adcq a, d2 + mulxq 24(x), a, h + adcq a, d3 + adcq $1, h + +// Now the quotient estimate is q = h + 1, and then we do the reduction, +// writing z = [d3;d2;d1;d0], as z' = (2^256 * h + z) - q * p_256k1 = +// (2^256 * h + z) - q * (2^256 - 4294968273) = -2^256 + (z + 4294968273 * q) + + mulxq h, a, c + + addq a, d0 + adcq c, d1 + adcq $0, d2 + adcq $0, d3 + +// Because of the implicit -2^256, CF means >= 0 so z' is the answer; ~CF +// means z' < 0 so we add p_256k1, which in 4 digits means subtracting c. 
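+// As a concrete sanity check, x = 1 must map to 2^256 mod p_256k1 =
+// 4294968273: there q = 1, the accumulated value is 2 * 4294968273 with no
+// carry-out, and the correction below subtracts 4294968273 again.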
+ + movq $0, a + cmovcq a, d + + subq d, d0 + movq d0, (z) + sbbq a, d1 + movq d1, 8(z) + sbbq a, d2 + movq d2, 16(z) + sbbq a, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_tomont_p256k1_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_tomont_p256k1_alt.S new file mode 100644 index 00000000000..572a7883083 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_tomont_p256k1_alt.S @@ -0,0 +1,115 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert to Montgomery form z := (2^256 * x) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_tomont_p256k1_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tomont_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tomont_p256k1_alt) + .text + +#define z %rdi +#define x %rsi + +#define c %rcx +#define d %rdx +#define h %rdx + +#define a %rax +#define ashort %eax +#define q %rax + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %rsi + +S2N_BN_SYMBOL(bignum_tomont_p256k1_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Since 2^256 == 4294968273 (mod p_256k1) we more or less just set +// m = 4294968273 then devolve to a variant of bignum_cmul_p256k1; +// the logic that q = h + 1 < 2^64 and hence doesn't wrap still holds +// since the multiplier 4294968273 is known to be much less than 2^64. +// We keep this constant in %rcx throughout as it's used repeatedly. + + movq $4294968273, c + +// Multiply, accumulating the result as 2^256 * h + [d3;d2;d1;d0] + + movq (x), a + mulq c + movq a, d0 + movq d, d1 + + movq 8(x), a + xorq d2, d2 + mulq c + addq a, d1 + adcq d, d2 + + movq 16(x), a + mulq c + addq a, d2 + adcq $0, d + + movq 24(x), a + movq d, d3 + mulq c + addq a, d3 + adcq $0, h + +// Now the quotient estimate is q = h + 1, and then we do the reduction, +// writing z = [d3;d2;d1;d0], as z' = (2^256 * h + z) - q * p_256k1 = +// (2^256 * h + z) - q * (2^256 - 4294968273) = -2^256 + (z + 4294968273 * q) + + leaq 1(h), q + mulq c + + addq %rax, d0 + adcq %rdx, d1 + adcq $0, d2 + adcq $0, d3 + +// Because of the implicit -2^256, CF means >= 0 so z' is the answer; ~CF +// means z' < 0 so we add p_256k1, which in 4 digits means subtracting c. 
+ + movq $0, a + cmovcq a, c + + subq c, d0 + movq d0, (z) + sbbq a, d1 + movq d1, 8(z) + sbbq a, d2 + movq d2, 16(z) + sbbq a, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_triple_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_triple_p256k1.S new file mode 100644 index 00000000000..f9f3ef3cd49 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_triple_p256k1.S @@ -0,0 +1,120 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Triple modulo p_256k1, z := (3 * x) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_triple_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// The input x can be any 4-digit bignum, not necessarily reduced modulo +// p_256k1, and the result is always fully reduced, z = (3 * x) mod p_256k1. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_p256k1) + .text + +#define z %rdi +#define x %rsi + +// Main digits of intermediate results + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +// Quotient estimate = top of product + 1 + +#define q %rdx + +// Other temporary variables and their short version + +#define a %rax +#define c %rcx + +#define ashort %eax +#define qshort %edx + +S2N_BN_SYMBOL(bignum_triple_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// First do the multiplication by 3, getting z = [h; d3; ...; d0] +// but immediately form the quotient estimate q = h + 1 + + xorl ashort, ashort + + movq (x), q + movq q, d0 + adcxq q, q + adoxq q, d0 + movq 8(x), q + movq q, d1 + adcxq q, q + adoxq q, d1 + movq 16(x), q + movq q, d2 + adcxq q, q + adoxq q, d2 + movq 24(x), q + movq q, d3 + adcxq q, q + adoxq q, d3 + +// For this limited range a simple quotient estimate of q = h + 1 works, where +// h = floor(z / 2^256). Then -p_256k1 <= z - q * p_256k1 < p_256k1. + + movl $1, qshort + adcxq a, q + adoxq a, q + +// Initial subtraction of z - q * p_256k1, actually by adding q * 4294968273. + + movq $4294968273, c + xorq a, a + imulq c, q + addq q, d0 + adcq a, d1 + adcq a, d2 + adcq a, d3 + +// With z = 2^256 * h + l, the underlying result z' is actually +// (2^256 * h + l) - q * (2^256 - 4294968273) = (l + q * 4294968273) - 2^256 +// so carry-clear <=> z' is negative. Correct by subtracting in that case. +// In any case, write final result to z as we go. 
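+// The bounds are comfortable here: x < 2^256 gives h <= 2 and q <= 3, so
+// q * 4294968273 < 2^35 and the single conditional subtraction below always
+// lands the result in [0, p_256k1).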
+ + cmovcq a, c + + subq c, d0 + movq d0, (z) + sbbq a, d1 + movq d1, 8(z) + sbbq a, d2 + movq d2, 16(z) + sbbq a, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_triple_p256k1_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_triple_p256k1_alt.S new file mode 100644 index 00000000000..ebd15d3c43a --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_triple_p256k1_alt.S @@ -0,0 +1,122 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Triple modulo p_256k1, z := (3 * x) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_triple_p256k1_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// The input x can be any 4-digit bignum, not necessarily reduced modulo +// p_256k1, and the result is always fully reduced, z = (3 * x) mod p_256k1. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_p256k1_alt) + .text + +#define z %rdi +#define x %rsi + +// Main digits of intermediate results + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +// Quotient estimate = top of product + 1 + +#define d %rdx +#define h %rdx +#define q %rdx + +// Other temporary variables and their short version + +#define a %rax +#define c %rcx + +#define ashort %eax +#define qshort %edx + +S2N_BN_SYMBOL(bignum_triple_p256k1_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// First do the multiplication by 3, getting z = [h; d3; ...; d0] +// but immediately form the quotient estimate q = h + 1 + + movq $3, c + + movq (x), a + mulq c + movq a, d0 + movq d, d1 + + movq 8(x), a + xorq d2, d2 + mulq c + addq a, d1 + adcq d, d2 + + movq 16(x), a + mulq c + addq a, d2 + adcq $0, d + + movq 24(x), a + movq d, d3 + mulq c + addq a, d3 + adcq $1, h + +// For this limited range a simple quotient estimate of q = h + 1 works, where +// h = floor(z / 2^256). Then -p_256k1 <= z - q * p_256k1 < p_256k1. +// Initial subtraction of z - q * p_256k1, actually by adding q * 4294968273. + + movq $4294968273, c + xorq a, a + imulq c, q + addq q, d0 + adcq a, d1 + adcq a, d2 + adcq a, d3 + +// With z = 2^256 * h + l, the underlying result z' is actually +// (2^256 * h + l) - q * (2^256 - 4294968273) = (l + q * 4294968273) - 2^256 +// so carry-clear <=> z' is negative. Correct by subtracting in that case. +// In any case, write final result to z as we go. 
+ + cmovcq a, c + + subq c, d0 + movq d0, (z) + sbbq a, d1 + movq d1, 8(z) + sbbq a, d2 + movq d2, 16(z) + sbbq a, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jadd.S new file mode 100644 index 00000000000..3237c0aa797 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jadd.S @@ -0,0 +1,425 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on SECG curve secp256k1 in Jacobian coordinates +// +// extern void secp256k1_jadd +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// It is assumed that all coordinates of the input points p1 and p2 are +// fully reduced mod p_256k1, that both z coordinates are nonzero and +// that neither p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents +// the same affine point as". +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(secp256k1_jadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(secp256k1_jadd) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1 and %rbp = p2, +// all of which are maintained throughout the code. 
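+// Each point is 12 64-bit words: the x, y and z field elements in that order,
+// each occupying NUMSIZE = 32 bytes (4 limbs), so the macros below are just
+// fixed offsets from the corresponding pointer.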
+ +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rbp) +#define y_2 NUMSIZE(%rbp) +#define z_2 (2*NUMSIZE)(%rbp) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define x1a (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define z2sq (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define y1a (NUMSIZE*6)(%rsp) + +#define NSPACE (NUMSIZE*7) + +// Corresponds exactly to bignum_mul_p256k1 + +#define mul_p256k1(P0,P1,P2) \ + xorl %ecx, %ecx ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + xorl %ecx, %ecx ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rcx, %r13 ; \ + xorl %ecx, %ecx ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcxq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rcx, %r15 ; \ + adcxq %rcx, %r15 ; \ + movabs $0x1000003d1, %rdx ; \ + xorl %ecx, %ecx ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + adcxq %rcx, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + mulxq %rax, %rax, %rbx ; \ + addq %rax, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + cmovbq %rcx, %rdx ; \ + subq %rdx, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Corresponds exactly to bignum_sqr_p256k1 + +#define sqr_p256k1(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ebx, %ebx ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + 
mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rbx, %r13 ; \ + adoxq %rbx, %r14 ; \ + adcq %rbx, %r14 ; \ + xorl %ebx, %ebx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rbx, %r15 ; \ + adoxq %rbx, %r15 ; \ + movabs $0x1000003d1, %rdx ; \ + xorl %ebx, %ebx ; \ + mulxq %r12, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq %r13, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq %r14, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + adcxq %rbx, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + mulxq %rax, %rax, %rcx ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rdx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Corresponds exactly to bignum_sub_p256k1 + +#define sub_p256k1(P0,P1,P2) \ + xorl %eax, %eax ; \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 0x8+P1, %r9 ; \ + sbbq 0x8+P2, %r9 ; \ + movq 0x10+P1, %r10 ; \ + sbbq 0x10+P2, %r10 ; \ + movq 0x18+P1, %r11 ; \ + sbbq 0x18+P2, %r11 ; \ + movabs $0x1000003d1, %rcx ; \ + cmovae %rax, %rcx ; \ + subq %rcx, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq %rax, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq %rax, %r11 ; \ + movq %r11, 0x18+P0 + +// Additional macros to help with final multiplexing + +#define load4(r0,r1,r2,r3,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 + +#define store4(P,r0,r1,r2,r3) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P + +#define czload4(r0,r1,r2,r3,P) \ + cmovzq P, r0 ; \ + cmovzq 8+P, r1 ; \ + cmovzq 16+P, r2 ; \ + cmovzq 24+P, r3 + +#define muxload4(r0,r1,r2,r3,P0,P1,P2) \ + movq P0, r0 ; \ + cmovbq P1, r0 ; \ + cmovnbe P2, r0 ; \ + movq 8+P0, r1 ; \ + cmovbq 8+P1, r1 ; \ + cmovnbe 8+P2, r1 ; \ + movq 16+P0, r2 ; \ + cmovbq 16+P1, r2 ; \ + cmovnbe 16+P2, r2 ; \ + movq 24+P0, r3 ; \ + cmovbq 24+P1, r3 ; \ + cmovnbe 24+P2, r3 + +S2N_BN_SYMBOL(secp256k1_jadd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input y in %rbp where it stays + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdx, %rbp + +// Main code, just a sequence of basic field operations + + sqr_p256k1(z1sq,z_1) + sqr_p256k1(z2sq,z_2) + + mul_p256k1(y1a,z_2,y_1) + mul_p256k1(y2a,z_1,y_2) + + mul_p256k1(x2a,z1sq,x_2) + mul_p256k1(x1a,z2sq,x_1) + mul_p256k1(y2a,z1sq,y2a) + mul_p256k1(y1a,z2sq,y1a) + + sub_p256k1(xd,x2a,x1a) + sub_p256k1(yd,y2a,y1a) + + sqr_p256k1(zz,xd) + sqr_p256k1(ww,yd) + + mul_p256k1(zzx1,zz,x1a) + mul_p256k1(zzx2,zz,x2a) + + sub_p256k1(resx,ww,zzx1) + sub_p256k1(t1,zzx2,zzx1) + + mul_p256k1(xd,xd,z_1) + + sub_p256k1(resx,resx,zzx2) + + sub_p256k1(t2,zzx1,resx) + + 
mul_p256k1(t1,t1,y1a) + mul_p256k1(resz,xd,z_2) + mul_p256k1(t2,yd,t2) + + sub_p256k1(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "NBE" <=> ~(CF \/ ZF) <=> P1 = 0 /\ ~(P2 = 0) +// and "B" <=> CF <=> ~(P1 = 0) /\ P2 = 0 +// and "Z" <=> ZF <=> (P1 = 0 <=> P2 = 0) + + load4(%r8,%r9,%r10,%r11,z_1) + + movq %r8, %rax + movq %r9, %rdx + orq %r10, %rax + orq %r11, %rdx + orq %rdx, %rax + negq %rax + sbbq %rax, %rax + + load4(%r12,%r13,%r14,%r15,z_2) + + movq %r12, %rbx + movq %r13, %rdx + orq %r14, %rbx + orq %r15, %rdx + orq %rdx, %rbx + negq %rbx + sbbq %rbx, %rbx + + cmpq %rax, %rbx + +// Multiplex the outputs accordingly, re-using the z's in registers + + cmovbq %r8, %r12 + cmovbq %r9, %r13 + cmovbq %r10, %r14 + cmovbq %r11, %r15 + + czload4(%r12,%r13,%r14,%r15,resz) + + muxload4(%rax,%rbx,%rcx,%rdx,resx,x_1,x_2) + muxload4(%r8,%r9,%r10,%r11,resy,y_1,y_2) + +// Finally store back the multiplexed values + + store4(x_3,%rax,%rbx,%rcx,%rdx) + store4(y_3,%r8,%r9,%r10,%r11) + store4(z_3,%r12,%r13,%r14,%r15) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jadd_alt.S new file mode 100644 index 00000000000..abafb033c6d --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jadd_alt.S @@ -0,0 +1,506 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on SECG curve secp256k1 in Jacobian coordinates +// +// extern void secp256k1_jadd_alt +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// It is assumed that all coordinates of the input points p1 and p2 are +// fully reduced mod p_256k1, that both z coordinates are nonzero and +// that neither p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents +// the same affine point as". +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(secp256k1_jadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(secp256k1_jadd_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1 and %rbp = p2, +// all of which are maintained throughout the code. 
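+// Note that %rsi stays live here as the p1 pointer, which is why the field
+// macros below keep the 4294968273 constant in %rbx rather than in %rsi as
+// the standalone bignum_mul_p256k1_alt does.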
+ +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rbp) +#define y_2 NUMSIZE(%rbp) +#define z_2 (2*NUMSIZE)(%rbp) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define x1a (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define z2sq (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define y1a (NUMSIZE*6)(%rsp) + +#define NSPACE (NUMSIZE*7) + +// Corresponds to bignum_mul_p256k1_alt except %rsi -> %rbx + +#define mul_p256k1(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movq $0x1000003d1, %rbx ; \ + movq %r12, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rbx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq 
%rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Corresponds to bignum_sqr_p256k1_alt except for %rsi -> %rbx + +#define sqr_p256k1(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r14 ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r15 ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movq $0x1000003d1, %rbx ; \ + movq %r12, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rbx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Corresponds exactly to bignum_sub_p256k1 + +#define sub_p256k1(P0,P1,P2) \ + xorl %eax, %eax ; \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 0x8+P1, %r9 ; \ + sbbq 0x8+P2, %r9 ; \ + movq 0x10+P1, %r10 ; \ + sbbq 0x10+P2, %r10 ; \ + movq 0x18+P1, %r11 ; \ + sbbq 0x18+P2, %r11 ; \ + movabs $0x1000003d1, %rcx ; \ + cmovae %rax, %rcx ; \ + subq %rcx, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq %rax, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq %rax, %r11 ; \ + movq %r11, 0x18+P0 + +// Additional macros to help with final multiplexing + +#define load4(r0,r1,r2,r3,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 + +#define store4(P,r0,r1,r2,r3) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P + +#define czload4(r0,r1,r2,r3,P) \ + cmovzq P, r0 ; \ + cmovzq 8+P, r1 ; \ + cmovzq 16+P, r2 ; \ + cmovzq 24+P, r3 + 
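+// muxload4 below is a branch-free three-way select: it loads P0 by default,
+// overrides with P1 when CF is set ("B", i.e. z_2 = 0 and z_1 != 0) and with
+// P2 on "NBE" (z_1 = 0 and z_2 != 0), matching the condition codes set up by
+// the z-coordinate comparison near the end of the routine.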
+#define muxload4(r0,r1,r2,r3,P0,P1,P2) \ + movq P0, r0 ; \ + cmovbq P1, r0 ; \ + cmovnbe P2, r0 ; \ + movq 8+P0, r1 ; \ + cmovbq 8+P1, r1 ; \ + cmovnbe 8+P2, r1 ; \ + movq 16+P0, r2 ; \ + cmovbq 16+P1, r2 ; \ + cmovnbe 16+P2, r2 ; \ + movq 24+P0, r3 ; \ + cmovbq 24+P1, r3 ; \ + cmovnbe 24+P2, r3 + +S2N_BN_SYMBOL(secp256k1_jadd_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input y in %rbp where it stays + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdx, %rbp + +// Main code, just a sequence of basic field operations + + sqr_p256k1(z1sq,z_1) + sqr_p256k1(z2sq,z_2) + + mul_p256k1(y1a,z_2,y_1) + mul_p256k1(y2a,z_1,y_2) + + mul_p256k1(x2a,z1sq,x_2) + mul_p256k1(x1a,z2sq,x_1) + mul_p256k1(y2a,z1sq,y2a) + mul_p256k1(y1a,z2sq,y1a) + + sub_p256k1(xd,x2a,x1a) + sub_p256k1(yd,y2a,y1a) + + sqr_p256k1(zz,xd) + sqr_p256k1(ww,yd) + + mul_p256k1(zzx1,zz,x1a) + mul_p256k1(zzx2,zz,x2a) + + sub_p256k1(resx,ww,zzx1) + sub_p256k1(t1,zzx2,zzx1) + + mul_p256k1(xd,xd,z_1) + + sub_p256k1(resx,resx,zzx2) + + sub_p256k1(t2,zzx1,resx) + + mul_p256k1(t1,t1,y1a) + mul_p256k1(resz,xd,z_2) + mul_p256k1(t2,yd,t2) + + sub_p256k1(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "NBE" <=> ~(CF \/ ZF) <=> P1 = 0 /\ ~(P2 = 0) +// and "B" <=> CF <=> ~(P1 = 0) /\ P2 = 0 +// and "Z" <=> ZF <=> (P1 = 0 <=> P2 = 0) + + load4(%r8,%r9,%r10,%r11,z_1) + + movq %r8, %rax + movq %r9, %rdx + orq %r10, %rax + orq %r11, %rdx + orq %rdx, %rax + negq %rax + sbbq %rax, %rax + + load4(%r12,%r13,%r14,%r15,z_2) + + movq %r12, %rbx + movq %r13, %rdx + orq %r14, %rbx + orq %r15, %rdx + orq %rdx, %rbx + negq %rbx + sbbq %rbx, %rbx + + cmpq %rax, %rbx + +// Multiplex the outputs accordingly, re-using the z's in registers + + cmovbq %r8, %r12 + cmovbq %r9, %r13 + cmovbq %r10, %r14 + cmovbq %r11, %r15 + + czload4(%r12,%r13,%r14,%r15,resz) + + muxload4(%rax,%rbx,%rcx,%rdx,resx,x_1,x_2) + muxload4(%r8,%r9,%r10,%r11,resy,y_1,y_2) + +// Finally store back the multiplexed values + + store4(x_3,%rax,%rbx,%rcx,%rdx) + store4(y_3,%r8,%r9,%r10,%r11) + store4(z_3,%r12,%r13,%r14,%r15) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jdouble.S new file mode 100644 index 00000000000..acecec83ca9 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jdouble.S @@ -0,0 +1,619 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on SECG curve secp256k1 in Jacobian coordinates +// +// extern void secp256k1_montjdouble +// (uint64_t p3[static 12],uint64_t p1[static 12]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). 
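+// (The entry point defined below is secp256k1_jdouble; the field arithmetic
+// uses the plain modular mul_p256k1/sqr_p256k1 macros rather than Montgomery
+// form.)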
+// It is assumed that all coordinates of the input point are fully +// reduced mod p_256k1 and that the z coordinate is not zero. +// +// Standard x86-64 ABI: RDI = p3, RSI = p1 +// Microsoft x64 ABI: RCX = p3, RDX = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(secp256k1_jdouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(secp256k1_jdouble) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1, which is true when the +// arguments come in initially and is not disturbed throughout. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define x_2 (NUMSIZE*0)(%rsp) +#define y_2 (NUMSIZE*1)(%rsp) +#define d (NUMSIZE*2)(%rsp) +#define tmp (NUMSIZE*3)(%rsp) +#define x_4 (NUMSIZE*4)(%rsp) +#define y_4 (NUMSIZE*6)(%rsp) +#define dx2 (NUMSIZE*8)(%rsp) +#define xy2 (NUMSIZE*10)(%rsp) + +#define NSPACE (NUMSIZE*12) + +// Corresponds exactly to bignum_mul_p256k1 + +#define mul_p256k1(P0,P1,P2) \ + xorl %ecx, %ecx ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + xorl %ecx, %ecx ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rcx, %r13 ; \ + xorl %ecx, %ecx ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcxq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rcx, %r15 ; \ + adcxq %rcx, %r15 ; \ + movabsq $0x1000003d1, %rdx ; \ + xorl %ecx, %ecx ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + adcxq %rcx, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + mulxq %rax, %rax, %rbx ; \ + addq %rax, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + cmovbq %rcx, %rdx ; \ + subq %rdx, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Corresponds exactly to bignum_sqr_p256k1 + +#define sqr_p256k1(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, 
%r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ebx, %ebx ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rbx, %r13 ; \ + adoxq %rbx, %r14 ; \ + adcq %rbx, %r14 ; \ + xorl %ebx, %ebx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rbx, %r15 ; \ + adoxq %rbx, %r15 ; \ + movabsq $0x1000003d1, %rdx ; \ + xorl %ebx, %ebx ; \ + mulxq %r12, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq %r13, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq %r14, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + adcxq %rbx, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + mulxq %rax, %rax, %rcx ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rdx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Rough versions producing 5-word results + +#define roughmul_p256k1(P0,P1,P2) \ + xorl %ecx, %ecx ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + xorl %ecx, %ecx ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rcx, %r13 ; \ + xorl %ecx, %ecx ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcxq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rcx, %r15 ; \ + adcxq %rcx, %r15 ; \ + movabsq $0x1000003d1, %rdx ; \ + xorl %ecx, %ecx ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + adcxq %rcx, %r12 ; 
\ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 ; \ + movq %r12, 0x20+P0 + +#define roughsqr_p256k1(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ebx, %ebx ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rbx, %r13 ; \ + adoxq %rbx, %r14 ; \ + adcq %rbx, %r14 ; \ + xorl %ebx, %ebx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rbx, %r15 ; \ + adoxq %rbx, %r15 ; \ + movabsq $0x1000003d1, %rdx ; \ + xorl %ebx, %ebx ; \ + mulxq %r12, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq %r13, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq %r14, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + adcxq %rbx, %r12 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 ; \ + movq %r12, 0x20+P0 + +// Weak doubling operation, staying in 4 digits but not in general +// fully normalizing + +#define weakdouble_p256k1(P0,P1) \ + movq 24+P1, %r11 ; \ + movq 16+P1, %r10 ; \ + movq $0x1000003d1, %rax ; \ + xorq %rdx, %rdx ; \ + shldq $1, %r10, %r11 ; \ + cmovncq %rdx, %rax ; \ + movq 8+P1, %r9 ; \ + shldq $1, %r9, %r10 ; \ + movq P1, %r8 ; \ + shldq $1, %r8, %r9 ; \ + shlq $1, %r8 ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %rdx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %r11, 24+P0 + +// P0 = C * P1 - D * P2 with 5-word inputs P1 and P2 +// Only used here with C = 12, D = 9, but could be used more generally. 
+// We actually compute C * P1 + D * (2^33 * p_256k1 - P2) + +#define cmsub_p256k1(P0,C,P1,D,P2) \ + movq $0xfffff85e00000000, %r8 ; \ + subq P2, %r8 ; \ + movq $0xfffffffffffffffd, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq $0xffffffffffffffff, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq $0xffffffffffffffff, %r11 ; \ + sbbq 24+P2, %r11 ; \ + movq $0x00000001ffffffff, %r12 ; \ + sbbq 32+P2, %r12 ; \ + movq $D, %rdx ; \ + mulxq %r8, %r8, %rax ; \ + mulxq %r9, %r9, %rcx ; \ + addq %rax, %r9 ; \ + mulxq %r10, %r10, %rax ; \ + adcq %rcx, %r10 ; \ + mulxq %r11, %r11, %rcx ; \ + adcq %rax, %r11 ; \ + mulxq %r12, %r12, %rax ; \ + adcq %rcx, %r12 ; \ + movq $C, %rdx ; \ + xorq %rbx, %rbx ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq 8+P1, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq 16+P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 24+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + mulxq 32+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movq $0x1000003d1, %rcx ; \ + mulq %rcx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + cmovbq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rbx, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq %rbx, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq %rbx, %r11 ; \ + movq %r11, 24+P0 ; \ + +// P0 = 3 * P1 - 8 * P2 with 5-digit P1 and P2 +// We actually compute 3 * P1 + (2^33 * p_256k1 - P2) << 3 + +#define cmsub38_p256k1(P0,P1,P2) \ + movq $0xfffff85e00000000, %r8 ; \ + subq P2, %r8 ; \ + movq $0xfffffffffffffffd, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq $0xffffffffffffffff, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq $0xffffffffffffffff, %r11 ; \ + sbbq 24+P2, %r11 ; \ + movq $0x00000001ffffffff, %r12 ; \ + sbbq 32+P2, %r12 ; \ + shldq $3, %r11, %r12 ; \ + shldq $3, %r10, %r11 ; \ + shldq $3, %r9, %r10 ; \ + shldq $3, %r8, %r9 ; \ + shlq $3, %r8 ; \ + movq $3, %rdx ; \ + xorq %rbx, %rbx ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq 8+P1, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq 16+P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 24+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + mulxq 32+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movq $0x1000003d1, %rcx ; \ + mulq %rcx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + cmovbq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rbx, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq %rbx, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq %rbx, %r11 ; \ + movq %r11, 24+P0 ; \ + +// P0 = 4 * P1 - P2 with 5-digit P1, 4-digit P2 and result. +// This is done by direct subtraction of P2 since the method +// in bignum_cmul_p256k1 etc. for quotient estimation still +// works when the value to be reduced is negative, as +// long as it is > -p_256k1, which is the case here. 
+ +#define cmsub41_p256k1(P0,P1,P2) \ + movq 32+P1, %r12 ; \ + movq 24+P1, %r11 ; \ + shldq $2, %r11, %r12 ; \ + movq 16+P1, %r10 ; \ + shldq $2, %r10, %r11 ; \ + movq 8+P1, %r9 ; \ + shldq $2, %r9, %r10 ; \ + movq P1, %r8 ; \ + shldq $2, %r8, %r9 ; \ + shlq $2, %r8 ; \ + subq P2, %r8 ; \ + sbbq 8+P2, %r9 ; \ + sbbq 16+P2, %r10 ; \ + sbbq 24+P2, %r11 ; \ + sbbq $0, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movq $0x1000003d1, %rcx ; \ + mulq %rcx; \ + xorq %rbx, %rbx ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + adcq $0x0, %r11 ; \ + cmovbq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rbx, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq %rbx, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq %rbx, %r11 ; \ + movq %r11, 24+P0 ; \ + +S2N_BN_SYMBOL(secp256k1_jdouble): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room on stack for temporary variables + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main sequence of operations + + // y_2 = y^2 + + sqr_p256k1(y_2,y_1) + + // x_2 = x^2 + + sqr_p256k1(x_2,x_1) + + // tmp = 2 * y_1 (in 4 words but not fully normalized) + + weakdouble_p256k1(tmp,y_1) + + // xy2 = x * y^2 (5-digit partially reduced) + // x_4 = x^4 (5-digit partially reduced) + + roughmul_p256k1(xy2,x_1,y_2) + roughsqr_p256k1(x_4,x_2) + + // z_3 = 2 * y_1 * z_1 + + mul_p256k1(z_3,z_1,tmp) + + // d = 12 * xy2 - 9 * x_4 + + cmsub_p256k1(d,12,xy2,9,x_4) + + // y4 = y2^2 (5-digit partially reduced) + + roughsqr_p256k1(y_4,y_2) + + // dx2 = d * x_2 (5-digit partially reduced) + + roughmul_p256k1(dx2,x_2,d) + + // x_3 = 4 * xy2 - d + + cmsub41_p256k1(x_3,xy2,d) + + // y_3 = 3 * dx2 - 8 * y_4 + + cmsub38_p256k1(y_3,dx2,y_4) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jdouble_alt.S new file mode 100644 index 00000000000..1452f4a3a93 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jdouble_alt.S @@ -0,0 +1,813 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on SECG curve secp256k1 in Jacobian coordinates +// +// extern void secp256k1_montjdouble_alt +// (uint64_t p3[static 12],uint64_t p1[static 12]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// It is assumed that all coordinates of the input point are fully +// reduced mod p_256k1 and that the z coordinate is not zero. 
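
A reference sketch (not part of the patch) of the doubling schedule in secp256k1_jdouble above, checking the 12/9, 4, and 3/8 constants against the affine tangent formula for a curve with a = 0. Names are illustrative and every step is fully reduced mod p, unlike the partially reduced 5-word intermediates in the assembly. Assumes Python 3.8+ for `pow(x, -1, p)`.

```python
import random

P = 2**256 - 2**32 - 977                      # p_256k1

def jdouble(p1):                              # same order as the macro calls
    x1, y1, z1 = p1
    y_2 = y1 * y1 % P
    x_2 = x1 * x1 % P
    tmp = 2 * y1 % P                          # weakdouble in the assembly
    xy2 = x1 * y_2 % P                        # x * y^2
    x_4 = x_2 * x_2 % P                       # x^4
    z_3 = z1 * tmp % P                        # 2 * y * z
    d = (12 * xy2 - 9 * x_4) % P
    y_4 = y_2 * y_2 % P                       # y^4
    dx2 = x_2 * d % P
    x_3 = (4 * xy2 - d) % P
    y_3 = (3 * dx2 - 8 * y_4) % P
    return x_3, y_3, z_3

def to_affine(p):
    x, y, z = p
    zi = pow(z, -1, P)
    return x * zi * zi % P, y * zi * zi * zi % P

def affine_double(a):                         # tangent rule, a = 0 on secp256k1
    lam = 3 * a[0] * a[0] * pow(2 * a[1], -1, P) % P
    x3 = (lam * lam - 2 * a[0]) % P
    y3 = (lam * (a[0] - x3) - a[1]) % P
    return x3, y3

# Any point on y^2 = x^3 + 7 will do; build one from a random x (P % 4 == 3).
while True:
    x = random.randrange(P)
    y = pow((x**3 + 7) % P, (P + 1) // 4, P)
    if y * y % P == (x**3 + 7) % P and y != 0:
        break
assert to_affine(jdouble((x, y, 1))) == affine_double((x, y))
print("doubling schedule matches the affine tangent formula")
```
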
+// +// Standard x86-64 ABI: RDI = p3, RSI = p1 +// Microsoft x64 ABI: RCX = p3, RDX = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(secp256k1_jdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(secp256k1_jdouble_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1, which is true when the +// arguments come in initially and is not disturbed throughout. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define x_2 (NUMSIZE*0)(%rsp) +#define y_2 (NUMSIZE*1)(%rsp) +#define d (NUMSIZE*2)(%rsp) +#define tmp (NUMSIZE*3)(%rsp) +#define x_4 (NUMSIZE*4)(%rsp) +#define y_4 (NUMSIZE*6)(%rsp) +#define dx2 (NUMSIZE*8)(%rsp) +#define xy2 (NUMSIZE*10)(%rsp) + +#define NSPACE (NUMSIZE*12) + +// Corresponds to bignum_mul_p256k1_alt except %rsi -> %rbx + +#define mul_p256k1(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movq $0x1000003d1, %rbx ; \ + movq %r12, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + 
mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rbx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Corresponds to bignum_sqr_p256k1_alt except for %rsi -> %rbx + +#define sqr_p256k1(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r14 ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r15 ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movq $0x1000003d1, %rbx ; \ + movq %r12, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rbx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Rough versions producing 5-word results + +#define roughmul_p256k1(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ 
+ adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movq $0x1000003d1, %rbx ; \ + movq %r12, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 ; \ + movq %r12, 0x20+P0 + +#define roughsqr_p256k1(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r14 ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r15 ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movq $0x1000003d1, %rbx ; \ + movq %r12, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq 
%rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 ; \ + movq %r12, 0x20+P0 + +// Weak doubling operation, staying in 4 digits but not in general +// fully normalizing + +#define weakdouble_p256k1(P0,P1) \ + movq 24+P1, %r11 ; \ + movq 16+P1, %r10 ; \ + movq $0x1000003d1, %rax ; \ + xorq %rdx, %rdx ; \ + shldq $1, %r10, %r11 ; \ + cmovncq %rdx, %rax ; \ + movq 8+P1, %r9 ; \ + shldq $1, %r9, %r10 ; \ + movq P1, %r8 ; \ + shldq $1, %r8, %r9 ; \ + shlq $1, %r8 ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %rdx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %r11, 24+P0 + +// P0 = C * P1 - D * P2 with 5-word inputs P1 and P2 +// Only used here with C = 12, D = 9, but could be used more generally. +// We actually compute C * P1 + D * (2^33 * p_256k1 - P2) + +#define cmsub_p256k1(P0,C,P1,D,P2) \ + movq $0xfffff85e00000000, %r9 ; \ + subq P2, %r9 ; \ + movq $0xfffffffffffffffd, %r10 ; \ + sbbq 8+P2, %r10 ; \ + movq $0xffffffffffffffff, %r11 ; \ + sbbq 16+P2, %r11 ; \ + movq $0xffffffffffffffff, %r12 ; \ + sbbq 24+P2, %r12 ; \ + movq $0x00000001ffffffff, %r13 ; \ + sbbq 32+P2, %r13 ; \ + movq $D, %rcx ; \ + movq %r9, %rax ; \ + mulq %rcx; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + movq %r10, %rax ; \ + xorl %r10d, %r10d ; \ + mulq %rcx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq %r11, %rax ; \ + xorl %r11d, %r11d ; \ + mulq %rcx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + movq %r12, %rax ; \ + xorl %r12d, %r12d ; \ + mulq %rcx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + imulq %r13, %rcx ; \ + addq %rcx, %r12 ; \ + movq $C, %rcx ; \ + movq P1, %rax ; \ + mulq %rcx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rbx, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rbx, %rbx ; \ + movq 0x10+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbx, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq %rcx; \ + addq %rax, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movq $0x1000003d1, %rcx ; \ + mulq %rcx; \ + xorl %ebx, %ebx ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + cmovbq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rbx, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq %rbx, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq %rbx, %r11 ; \ + movq %r11, 24+P0 ; \ + +// P0 = 3 * P1 - 8 * P2 with 5-digit P1 and P2 +// We actually compute 3 * P1 + (2^33 * p_256k1 - P2) << 3 + +#define cmsub38_p256k1(P0,P1,P2) \ + movq $0xfffff85e00000000, %r8 ; \ + subq P2, %r8 ; \ + movq $0xfffffffffffffffd, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq $0xffffffffffffffff, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq $0xffffffffffffffff, %r11 ; \ + sbbq 24+P2, %r11 ; \ + movq $0x00000001ffffffff, %r12 ; \ + sbbq 32+P2, %r12 ; \ + shldq $3, %r11, %r12 ; \ + shldq $3, %r10, %r11 ; \ + shldq $3, %r9, %r10 ; \ + shldq $3, %r8, %r9 ; \ + shlq $3, %r8 ; \ + movl $3, %ecx ; \ + movq P1, %rax ; \ + mulq %rcx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 
; \ + sbbq %rbx, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rbx, %rbx ; \ + movq 0x10+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbx, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq %rcx; \ + addq %rax, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movq $0x1000003d1, %rcx ; \ + mulq %rcx; \ + xorl %ebx, %ebx ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + cmovbq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rbx, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq %rbx, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq %rbx, %r11 ; \ + movq %r11, 24+P0 ; \ + +// P0 = 4 * P1 - P2 with 5-digit P1, 4-digit P2 and result. +// This is done by direct subtraction of P2 since the method +// in bignum_cmul_p256k1 etc. for quotient estimation still +// works when the value to be reduced is negative, as +// long as it is > -p_256k1, which is the case here. + +#define cmsub41_p256k1(P0,P1,P2) \ + movq 32+P1, %r12 ; \ + movq 24+P1, %r11 ; \ + shldq $2, %r11, %r12 ; \ + movq 16+P1, %r10 ; \ + shldq $2, %r10, %r11 ; \ + movq 8+P1, %r9 ; \ + shldq $2, %r9, %r10 ; \ + movq P1, %r8 ; \ + shldq $2, %r8, %r9 ; \ + shlq $2, %r8 ; \ + subq P2, %r8 ; \ + sbbq 8+P2, %r9 ; \ + sbbq 16+P2, %r10 ; \ + sbbq 24+P2, %r11 ; \ + sbbq $0, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movq $0x1000003d1, %rcx ; \ + mulq %rcx; \ + xorq %rbx, %rbx ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + adcq $0x0, %r11 ; \ + cmovbq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rbx, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq %rbx, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq %rbx, %r11 ; \ + movq %r11, 24+P0 ; \ + +S2N_BN_SYMBOL(secp256k1_jdouble_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room on stack for temporary variables + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main sequence of operations + + // y_2 = y^2 + + sqr_p256k1(y_2,y_1) + + // x_2 = x^2 + + sqr_p256k1(x_2,x_1) + + // tmp = 2 * y_1 (in 4 words but not fully normalized) + + weakdouble_p256k1(tmp,y_1) + + // xy2 = x * y^2 (5-digit partially reduced) + // x_4 = x^4 (5-digit partially reduced) + + roughmul_p256k1(xy2,x_1,y_2) + roughsqr_p256k1(x_4,x_2) + + // z_3 = 2 * y_1 * z_1 + + mul_p256k1(z_3,z_1,tmp) + + // d = 12 * xy2 - 9 * x_4 + + cmsub_p256k1(d,12,xy2,9,x_4) + + // y4 = y2^2 (5-digit partially reduced) + + roughsqr_p256k1(y_4,y_2) + + // dx2 = d * x_2 (5-digit partially reduced) + + roughmul_p256k1(dx2,x_2,d) + + // x_3 = 4 * xy2 - d + + cmsub41_p256k1(x_3,xy2,d) + + // y_3 = 3 * dx2 - 8 * y_4 + + cmsub38_p256k1(y_3,dx2,y_4) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jmixadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jmixadd.S new file mode 100644 index 00000000000..561b645e4dc --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jmixadd.S @@ -0,0 +1,397 @@ +// Copyright 
Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on SECG curve secp256k1 in Jacobian coordinates +// +// extern void secp256k1_jmixadd +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. It is assumed that +// all the coordinates of the input points p1 and p2 are fully reduced +// mod p_256k1, that the z coordinate of p1 is nonzero and that neither +// p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents the same affine +// point as". +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(secp256k1_jmixadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(secp256k1_jmixadd) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1 and %rbp = p2, +// all of which are maintained throughout the code. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rbp) +#define y_2 NUMSIZE(%rbp) +#define z_2 (2*NUMSIZE)(%rbp) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define NSPACE (NUMSIZE*6) + +// Corresponds exactly to bignum_mul_p256k1 + +#define mul_p256k1(P0,P1,P2) \ + xorl %ecx, %ecx ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + xorl %ecx, %ecx ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rcx, %r13 ; \ + xorl %ecx, %ecx ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcxq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, 
%r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rcx, %r15 ; \ + adcxq %rcx, %r15 ; \ + movabs $0x1000003d1, %rdx ; \ + xorl %ecx, %ecx ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + adcxq %rcx, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + mulxq %rax, %rax, %rbx ; \ + addq %rax, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + cmovbq %rcx, %rdx ; \ + subq %rdx, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Corresponds exactly to bignum_sqr_p256k1 + +#define sqr_p256k1(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ebx, %ebx ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rbx, %r13 ; \ + adoxq %rbx, %r14 ; \ + adcq %rbx, %r14 ; \ + xorl %ebx, %ebx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rbx, %r15 ; \ + adoxq %rbx, %r15 ; \ + movabs $0x1000003d1, %rdx ; \ + xorl %ebx, %ebx ; \ + mulxq %r12, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq %r13, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq %r14, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + adcxq %rbx, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + mulxq %rax, %rax, %rcx ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rdx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Corresponds exactly to bignum_sub_p256k1 + +#define sub_p256k1(P0,P1,P2) \ + xorl %eax, %eax ; \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 0x8+P1, %r9 ; \ + sbbq 0x8+P2, %r9 ; \ + movq 0x10+P1, %r10 ; \ + sbbq 0x10+P2, %r10 ; \ + movq 0x18+P1, %r11 ; \ + sbbq 0x18+P2, %r11 ; \ + movabs $0x1000003d1, %rcx ; \ + cmovae %rax, %rcx ; \ + subq %rcx, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq %rax, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq %rax, %r11 ; \ + movq %r11, 0x18+P0 + +// Additional macros to help with final multiplexing + +#define testzero4(P) \ + movq P, %rax ; \ + movq 8+P, %rdx ; \ + orq 16+P, %rax ; \ + orq 24+P, %rdx ; \ + orq %rdx, %rax + +#define mux4(r0,r1,r2,r3,PNE,PEQ) \ + movq PNE, r0 ; \ + movq PEQ, %rax ; \ + cmovzq %rax, 
r0 ; \ + movq 8+PNE, r1 ; \ + movq 8+PEQ, %rax ; \ + cmovzq %rax, r1 ; \ + movq 16+PNE, r2 ; \ + movq 16+PEQ, %rax ; \ + cmovzq %rax, r2 ; \ + movq 24+PNE, r3 ; \ + movq 24+PEQ, %rax ; \ + cmovzq %rax, r3 + +#define load4(r0,r1,r2,r3,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 + +#define store4(P,r0,r1,r2,r3) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P + +S2N_BN_SYMBOL(secp256k1_jmixadd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input y in %rbp where it stays + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdx, %rbp + +// Main code, just a sequence of basic field operations + + sqr_p256k1(zp2,z_1) + + mul_p256k1(y2a,z_1,y_2) + mul_p256k1(x2a,zp2,x_2) + mul_p256k1(y2a,zp2,y2a) + + sub_p256k1(xd,x2a,x_1) + + sub_p256k1(yd,y2a,y_1) + + sqr_p256k1(zz,xd) + sqr_p256k1(ww,yd) + + mul_p256k1(zzx1,zz,x_1) + mul_p256k1(zzx2,zz,x2a) + + sub_p256k1(resx,ww,zzx1) + sub_p256k1(t1,zzx2,zzx1) + + mul_p256k1(resz,xd,z_1) + + sub_p256k1(resx,resx,zzx2) + + sub_p256k1(t2,zzx1,resx) + + mul_p256k1(t1,t1,y_1) + mul_p256k1(t2,yd,t2) + + sub_p256k1(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + testzero4(z_1) + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with an extra z = 1 +// coordinate, hence giving 0 + p2 = p2 for the final result. + + mux4(%r8,%r9,%r10,%r11,resx,x_2) + mux4(%r12,%r13,%r14,%r15,resy,y_2) + + store4(x_3,%r8,%r9,%r10,%r11) + store4(y_3,%r12,%r13,%r14,%r15) + + load4(%r8,%r9,%r10,%r11,resz) + movl $1, %eax + cmovzq %rax, %r8 + movl $0, %eax + cmovzq %rax, %r9 + cmovzq %rax, %r10 + cmovzq %rax, %r11 + + store4(z_3,%r8,%r9,%r10,%r11) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jmixadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jmixadd_alt.S new file mode 100644 index 00000000000..8e91773c638 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jmixadd_alt.S @@ -0,0 +1,478 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on SECG curve secp256k1 in Jacobian coordinates +// +// extern void secp256k1_jmixadd_alt +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. It is assumed that +// all the coordinates of the input points p1 and p2 are fully reduced +// mod p_256k1, that the z coordinate of p1 is nonzero and that neither +// p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents the same affine +// point as". 
+// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(secp256k1_jmixadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(secp256k1_jmixadd_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1 and %rbp = p2, +// all of which are maintained throughout the code. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rbp) +#define y_2 NUMSIZE(%rbp) +#define z_2 (2*NUMSIZE)(%rbp) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define NSPACE (NUMSIZE*6) + +// Corresponds to bignum_mul_p256k1_alt except %rsi -> %rbx + +#define mul_p256k1(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movq $0x1000003d1, %rbx ; \ + movq %r12, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rbx; \ + 
subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rbx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Corresponds to bignum_sqr_p256k1_alt except for %rsi -> %rbx + +#define sqr_p256k1(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r14 ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r15 ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movq $0x1000003d1, %rbx ; \ + movq %r12, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rbx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Corresponds exactly to bignum_sub_p256k1 + +#define sub_p256k1(P0,P1,P2) \ + xorl %eax, %eax ; \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 0x8+P1, %r9 ; \ + sbbq 0x8+P2, %r9 ; \ + movq 0x10+P1, %r10 ; \ + sbbq 0x10+P2, %r10 ; \ + movq 0x18+P1, %r11 ; \ + sbbq 0x18+P2, %r11 ; \ + movabs $0x1000003d1, %rcx ; \ + cmovae %rax, %rcx ; \ + subq %rcx, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq %rax, 
%r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq %rax, %r11 ; \ + movq %r11, 0x18+P0 + +// Additional macros to help with final multiplexing + +#define testzero4(P) \ + movq P, %rax ; \ + movq 8+P, %rdx ; \ + orq 16+P, %rax ; \ + orq 24+P, %rdx ; \ + orq %rdx, %rax + +#define mux4(r0,r1,r2,r3,PNE,PEQ) \ + movq PNE, r0 ; \ + movq PEQ, %rax ; \ + cmovzq %rax, r0 ; \ + movq 8+PNE, r1 ; \ + movq 8+PEQ, %rax ; \ + cmovzq %rax, r1 ; \ + movq 16+PNE, r2 ; \ + movq 16+PEQ, %rax ; \ + cmovzq %rax, r2 ; \ + movq 24+PNE, r3 ; \ + movq 24+PEQ, %rax ; \ + cmovzq %rax, r3 + +#define load4(r0,r1,r2,r3,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 + +#define store4(P,r0,r1,r2,r3) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P + +S2N_BN_SYMBOL(secp256k1_jmixadd_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input y in %rbp where it stays + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdx, %rbp + +// Main code, just a sequence of basic field operations + + sqr_p256k1(zp2,z_1) + + mul_p256k1(y2a,z_1,y_2) + mul_p256k1(x2a,zp2,x_2) + mul_p256k1(y2a,zp2,y2a) + + sub_p256k1(xd,x2a,x_1) + + sub_p256k1(yd,y2a,y_1) + + sqr_p256k1(zz,xd) + sqr_p256k1(ww,yd) + + mul_p256k1(zzx1,zz,x_1) + mul_p256k1(zzx2,zz,x2a) + + sub_p256k1(resx,ww,zzx1) + sub_p256k1(t1,zzx2,zzx1) + + mul_p256k1(resz,xd,z_1) + + sub_p256k1(resx,resx,zzx2) + + sub_p256k1(t2,zzx1,resx) + + mul_p256k1(t1,t1,y_1) + mul_p256k1(t2,yd,t2) + + sub_p256k1(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + testzero4(z_1) + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with an extra z = 1 +// coordinate, hence giving 0 + p2 = p2 for the final result. + + mux4(%r8,%r9,%r10,%r11,resx,x_2) + mux4(%r12,%r13,%r14,%r15,resy,y_2) + + store4(x_3,%r8,%r9,%r10,%r11) + store4(y_3,%r12,%r13,%r14,%r15) + + load4(%r8,%r9,%r10,%r11,resz) + movl $1, %eax + cmovzq %rax, %r8 + movl $0, %eax + cmovzq %rax, %r9 + cmovzq %rax, %r10 + cmovzq %rax, %r11 + + store4(z_3,%r8,%r9,%r10,%r11) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_add_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_add_sm2.S new file mode 100644 index 00000000000..3edff95c3e1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_add_sm2.S @@ -0,0 +1,100 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Add modulo p_sm2, z := (x + y) mod p_sm2, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_add_sm2 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add_sm2) + .text + +#define z %rdi +#define x %rsi +#define y %rdx + +#define d0 %rax +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n1 %r10 +#define n3 %rdx +#define c %r11 + +#define n1short %r10d + + + +S2N_BN_SYMBOL(bignum_add_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Load and add the two inputs as 2^256 * c + [d3;d2;d1;d0] = x + y + + xorq c, c + movq (x), d0 + addq (y), d0 + movq 8(x), d1 + adcq 8(y), d1 + movq 16(x), d2 + adcq 16(y), d2 + movq 24(x), d3 + adcq 24(y), d3 + adcq c, c + +// Now subtract 2^256 * c + [d3;d3;d1;d1] = x + y - p_sm2 +// The constants n1 and n3 in [n3; 0; n1; -1] = p_sm2 are saved for later + + subq $-1, d0 + movq $0xffffffff00000000, n1 + sbbq n1, d1 + sbbq $-1, d2 + movq $0xfffffffeffffffff, n3 + sbbq n3, d3 + +// Since by hypothesis x < p_sm2 we know x + y - p_sm2 < 2^256, so the top +// carry c actually gives us a bitmask for x + y - p_sm2 < 0, which we +// now use to make a masked p_sm2' = [n3; 0; n1; c] + + sbbq $0, c + andq c, n1 + andq c, n3 + +// Do the corrective addition and copy to output + + addq c, d0 + movq d0, (z) + adcq n1, d1 + movq d1, 8(z) + adcq c, d2 + movq d2, 16(z) + adcq n3, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_cmul_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_cmul_sm2.S new file mode 100644 index 00000000000..e4c2caf3869 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_cmul_sm2.S @@ -0,0 +1,133 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
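
A minimal Python model (not part of the patch) of the bignum_add_sm2 flow just above: add, tentatively subtract p_sm2, and add p_sm2 back only when that subtraction borrowed. The branch here stands in for the masked constant [n3; 0; n1; c] that the assembly builds from the borrow.

```python
import random

P_SM2 = 2**256 - 2**224 - 2**96 + 2**64 - 1
N1, N3 = 0xffffffff00000000, 0xfffffffeffffffff
# The words of p_sm2 really are [n3; -1; n1; -1], as the comments state.
assert P_SM2 == (N3 << 192) | (0xffffffffffffffff << 128) | (N1 << 64) | 0xffffffffffffffff

def add_sm2(x, y):
    t = x + y - P_SM2                     # 2^256*c + [d3;d2;d1;d0] minus p_sm2
    return t + P_SM2 if t < 0 else t      # corrective (masked) addition

for _ in range(1000):
    x, y = random.randrange(P_SM2), random.randrange(P_SM2)
    assert add_sm2(x, y) == (x + y) % P_SM2
print("add_sm2 model matches (x + y) mod p_sm2")
```
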
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word modulo p_sm2, z := (c * x) mod p_sm2, assuming +// x reduced +// Inputs c, x[4]; output z[4] +// +// extern void bignum_cmul_sm2 +// (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = c, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = c, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_sm2) + .text + +#define z %rdi + +// Temporarily moved here for initial multiply +#define x %rcx +// Likewise this is thrown away after initial multiply +#define m %rdx + +#define a %rax +#define c %rcx + +#define d0 %rsi +#define d1 %r8 +#define d2 %r9 +#define d3 %r10 +#define h %r11 + +// Multiplier again for second stage +#define q %rdx +#define qshort %edx + +S2N_BN_SYMBOL(bignum_cmul_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Shuffle inputs (since we want multiplier in %rdx) + + movq %rdx, x + movq %rsi, m + +// Multiply, accumulating the result as ca = 2^256 * h + [d3;d2;d1;d0] + + mulxq (x), d0, d1 + mulxq 8(x), a, d2 + addq a, d1 + mulxq 16(x), a, d3 + adcq a, d2 + mulxq 24(x), a, h + adcq a, d3 + adcq $0, h + +// Quotient approximation is (h * (1 + 2^32 + 2^64) + d3 + 2^64) >> 64. +// Note that by hypothesis our product is <= (2^64 - 1) * (p_sm2 - 1), +// so there is no need to max this out to avoid wrapping, unlike in the +// more general case of bignum_mod_sm2. + + movq d3, a + movl $1, qshort + addq h, a + adcq h, q + + shrq $32, a + addq h, a + + shrq $32, a + addq a, q + +// Now compute the initial pre-reduced [h;d3;d2;d1;d0] = ca - p_sm2 * q +// = ca - (2^256 - 2^224 - 2^96 + 2^64 - 1) * q + + movq q, a + movq q, c + shlq $32, a + shrq $32, c + + addq a, d3 + adcq c, h + + subq q, a + sbbq $0, c + + subq q, h + + addq q, d0 + adcq a, d1 + adcq c, d2 + adcq $0, d3 + adcq $0, h + +// Now our top word h is either zero or all 1s, and we use this to discriminate +// whether a correction is needed because our result is negative, as a bitmask +// Do a masked addition of p_sm2 + + movq $0xffffffff00000000, a + andq h, a + movq $0xfffffffeffffffff, c + andq h, c + addq h, d0 + movq d0, (z) + adcq a, d1 + movq d1, 8(z) + adcq h, d2 + movq d2, 16(z) + adcq c, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_cmul_sm2_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_cmul_sm2_alt.S new file mode 100644 index 00000000000..770d83e9ed5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_cmul_sm2_alt.S @@ -0,0 +1,150 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
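
A Python sketch (not part of the patch) of the reduction in bignum_cmul_sm2 just above. Per the comments in the code, the quotient approximation q = (h*(1 + 2^32 + 2^64) + d3 + 2^64) >> 64 is at most one too large and never too small for products c*x with x already reduced, so a single masked addition of p_sm2 finishes the job; the function name and the widened arithmetic are illustrative only.

```python
import random

P_SM2 = 2**256 - 2**224 - 2**96 + 2**64 - 1

def cmul_sm2(c, x):
    ca = c * x
    h, l = ca >> 256, ca & (2**256 - 1)
    d3 = l >> 192                                     # top word of the low part
    q = (h * (1 + 2**32 + 2**64) + d3 + 2**64) >> 64  # quotient approximation
    r = ca - q * P_SM2
    assert -P_SM2 <= r < P_SM2                        # one correction suffices
    return r + P_SM2 if r < 0 else r                  # masked add of p_sm2

for _ in range(1000):
    c, x = random.randrange(2**64), random.randrange(P_SM2)
    assert cmul_sm2(c, x) == c * x % P_SM2
print("cmul_sm2 quotient approximation reduces correctly")
```
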
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word modulo p_sm2, z := (c * x) mod p_sm2, assuming +// x reduced +// Inputs c, x[4]; output z[4] +// +// extern void bignum_cmul_sm2_alt +// (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = c, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = c, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_sm2_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_sm2_alt) + .text + +#define z %rdi + +// Temporarily moved here for initial multiply then thrown away + +#define x %rcx +#define m %rsi + +// Other variables + +#define d %rdx +#define a %rax +#define c %rcx + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 +#define h %rsi + +#define hshort %esi + +// Multiplier again for second stage +#define q %rdx +#define qshort %edx + +S2N_BN_SYMBOL(bignum_cmul_sm2_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Shuffle inputs (since we want %rdx for the high parts of products) + + movq %rdx, x + +// Multiply, accumulating the result as ca = 2^256 * h + [d3;d2;d1;d0] + + movq (x), a + mulq m + movq a, d0 + movq d, d1 + + movq 8(x), a + mulq m + xorq d2, d2 + addq a, d1 + adcq d, d2 + + movq 16(x), a + mulq m + xorq d3, d3 + addq a, d2 + adcq d, d3 + + movq 24(x), a + mulq m + xorl hshort, hshort + addq a, d3 + adcq d, h + +// Quotient approximation is (h * (1 + 2^32 + 2^64) + d3 + 2^64) >> 64. +// Note that by hypothesis our product is <= (2^64 - 1) * (p_sm2 - 1), +// so there is no need to max this out to avoid wrapping, unlike in the +// more general case of bignum_mod_sm2. + + movq d3, a + movl $1, qshort + addq h, a + adcq h, q + + shrq $32, a + addq h, a + shrq $32, a + addq a, q + +// Now compute the initial pre-reduced [h;d3;d2;d1;d0] = ca - p_sm2 * q +// = ca - (2^256 - 2^224 - 2^96 + 2^64 - 1) * q + + movq q, a + movq q, c + shlq $32, a + shrq $32, c + + addq a, d3 + adcq c, h + + subq q, a + sbbq $0, c + + subq q, h + + addq q, d0 + adcq a, d1 + adcq c, d2 + adcq $0, d3 + adcq $0, h + +// Now our top word h is either zero or all 1s, and we use this to discriminate +// whether a correction is needed because our result is negative, as a bitmask +// Do a masked addition of p_sm2 + + movq $0xffffffff00000000, a + andq h, a + movq $0xfffffffeffffffff, c + andq h, c + addq h, d0 + movq d0, (z) + adcq a, d1 + movq d1, 8(z) + adcq h, d2 + movq d2, 16(z) + adcq c, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_deamont_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_deamont_sm2.S new file mode 100644 index 00000000000..fa0bc34eed5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_deamont_sm2.S @@ -0,0 +1,119 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from almost-Montgomery form, z := (x / 2^256) mod p_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_deamont_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Convert a 4-digit bignum x out of its (optionally almost) Montgomery form, +// "almost" meaning any 4-digit input will work, with no range restriction. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_deamont_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_deamont_sm2) + .text + +#define z %rdi +#define x %rsi + +#define c %rcx +#define n1 %rax +#define n3 %rdx + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d0;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1], and using %rax, %rcx, %rdx and %rsi +// as temporaries. +// --------------------------------------------------------------------------- + +#define montreds(d3,d2,d1,d0) \ + movq d0, %rax ; \ + shlq $32, %rax ; \ + movq d0, %rcx ; \ + shrq $32, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rsi ; \ + subq d0, %rax ; \ + sbbq $0, %rcx ; \ + subq %rax, d1 ; \ + sbbq %rcx, d2 ; \ + sbbq %rdx, d3 ; \ + sbbq %rsi, d0 + +S2N_BN_SYMBOL(bignum_deamont_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Set up an initial 4-word window [%r11,%r10,%r9,%r8] = x + + movq (x), %r8 + movq 8(x), %r9 + movq 16(x), %r10 + movq 24(x), %r11 + +// Systematically scroll left doing 1-step reductions. This process +// keeps things inside 4 digits (i.e. < 2^256) at each stage, since +// we have w * p_sm2 + x <= (2^64 - 1) * p_sm2 + (2 EXP 256 - 1) +// <= (2^64 - 1) * (2^256 - 1) + (2 EXP 256 - 1) <= 2^64 * (2^256 - 1) + + montreds(%r11,%r10,%r9,%r8) + + montreds(%r8,%r11,%r10,%r9) + + montreds(%r9,%r8,%r11,%r10) + + montreds(%r10,%r9,%r8,%r11) + +// Let [%r11;%r10;%r9;%r8] := [%r11;%r10;%r9;%r8] - p_sm2, saving constants +// n1 and n3 in [n3; -1; n1; -1] = p_sm2 for later use. + + subq $-1, %r8 + movq $0xffffffff00000000, n1 + sbbq n1, %r9 + sbbq $-1, %r10 + movq $0xfffffffeffffffff, n3 + sbbq n3, %r11 + +// Capture the carry to determine whether to add back p_sm2, and use +// it to create a masked p_sm2' = [n3; c; n1; c] + + sbbq c, c + andq c, n1 + andq c, n3 + +// Do the corrective addition and copy to output + + addq c, %r8 + movq %r8, (z) + adcq n1, %r9 + movq %r9, 8(z) + adcq c, %r10 + movq %r10, 16(z) + adcq n3, %r11 + movq %r11, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_demont_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_demont_sm2.S new file mode 100644 index 00000000000..360a4e50811 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_demont_sm2.S @@ -0,0 +1,93 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
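The montreds macro used by these Montgomery conversion routines amounts to one word-sized reduction step. A rough Python model (illustrative only, not part of the imported source; it relies on p_sm2 being congruent to -1 mod 2^64, so the Montgomery multiplier is just the low word):

P_SM2 = 2**256 - 2**224 - 2**96 + 2**64 - 1

def montreds_model(d):
    # One step: add d0 * p_sm2 to clear the low word, then drop that word.
    # (d + d0 * P_SM2) is an exact multiple of 2^64 since P_SM2 == -1 (mod 2^64).
    d0 = d & (2**64 - 1)
    return (d + d0 * P_SM2) >> 64

Four such steps divide by 2^256 overall, which is the x / 2^256 (mod p_sm2) conversion these routines compute.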
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from Montgomery form z := (x / 2^256) mod p_sm2, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_demont_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// This assumes the input is < p_sm2 for correctness. If this is not the case, +// use the variant "bignum_deamont_sm2" instead. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_demont_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_demont_sm2) + .text + +#define z %rdi +#define x %rsi + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d0;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1], and using %rax, %rcx, %rdx and %rsi +// as temporaries. +// --------------------------------------------------------------------------- + +#define montreds(d3,d2,d1,d0) \ + movq d0, %rax ; \ + shlq $32, %rax ; \ + movq d0, %rcx ; \ + shrq $32, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rsi ; \ + subq d0, %rax ; \ + sbbq $0, %rcx ; \ + subq %rax, d1 ; \ + sbbq %rcx, d2 ; \ + sbbq %rdx, d3 ; \ + sbbq %rsi, d0 + +S2N_BN_SYMBOL(bignum_demont_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Set up an initial 4-word window [%r11,%r10,%r9,%r8] = x + + movq (x), %r8 + movq 8(x), %r9 + movq 16(x), %r10 + movq 24(x), %r11 + +// Systematically scroll left doing 1-step reductions. This process +// keeps things reduced < p_sm2 at each stage, since we have +// w * p_sm2 + x <= (2^64 - 1) * p_sm2 + (p_sm2 - 1) < 2^64 * p_sm2 + + montreds(%r11,%r10,%r9,%r8) + + montreds(%r8,%r11,%r10,%r9) + + montreds(%r9,%r8,%r11,%r10) + + montreds(%r10,%r9,%r8,%r11) + +// Write back result + + movq %r8, (z) + movq %r9, 8(z) + movq %r10, 16(z) + movq %r11, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_double_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_double_sm2.S new file mode 100644 index 00000000000..857a0675308 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_double_sm2.S @@ -0,0 +1,97 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Double modulo p_sm2, z := (2 * x) mod p_sm2, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_double_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_double_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_double_sm2) + .text + +#define z %rdi +#define x %rsi + +#define d0 %rdx +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n1 %r10 +#define n3 %r11 +#define c %rax + +#define n1short %r10d + +S2N_BN_SYMBOL(bignum_double_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load the input and double it so that 2^256 * c + [d3;d2;d1;d0] = 2 * x +// Could also consider using shld to decouple carries + + xorq c, c + movq (x), d0 + addq d0, d0 + movq 8(x), d1 + adcq d1, d1 + movq 16(x), d2 + adcq d2, d2 + movq 24(x), d3 + adcq d3, d3 + adcq c, c + +// Now subtract 2^256 * c + [d3;d3;d1;d1] = 2 * x - p_sm2 +// The constants n1 and n3 in [n3; -1; n1; -1] = p_sm2 are saved for later + + subq $-1, d0 + movq $0xffffffff00000000, n1 + sbbq n1, d1 + sbbq $-1, d2 + movq $0xfffffffeffffffff, n3 + sbbq n3, d3 + +// Since by hypothesis x < p_sm2 we know 2 * x - p_sm2 < 2^256, so the top +// carry c actually gives us a bitmask for 2 * x - p_sm2 < 0, which we +// now use to make a masked p_sm2' = [n3; c; n1; c] + + sbbq $0, c + andq c, n1 + andq c, n3 + +// Do the corrective addition and copy to output + + addq c, d0 + movq d0, (z) + adcq n1, d1 + movq d1, 8(z) + adcq c, d2 + movq d2, 16(z) + adcq n3, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_half_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_half_sm2.S new file mode 100644 index 00000000000..b2502942b69 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_half_sm2.S @@ -0,0 +1,89 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Halve modulo p_sm2, z := (x / 2) mod p_sm2, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_half_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_half_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_half_sm2) + .text + +#define z %rdi +#define x %rsi + +#define a %rax +#define d0 %rcx +#define d1 %rdx +#define d2 %r8 +#define d3 %r9 + +#define d0short %ecx +#define d1short %edx + +S2N_BN_SYMBOL(bignum_half_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load lowest digit and get a mask for its lowest bit in d0 + + movq (x), a + movl $1, d0short + andq a, d0 + negq d0 + +// Create a masked version of p_sm2 + + movq $0xffffffff00000000, d1 + andq d0, d1 + movq d0, d2 + movq $0xfffffffeffffffff, d3 + andq d0, d3 + +// Perform addition with masked p_sm2. Catch the carry in a, as a bitmask +// for convenience though we only use its LSB below with SHRD + + addq a, d0 + adcq 8(x), d1 + adcq 16(x), d2 + adcq 24(x), d3 + sbbq a, a + +// Shift right, pushing the carry back down, and store back + + shrdq $1, d1, d0 + movq d0, (z) + shrdq $1, d2, d1 + movq d1, 8(z) + shrdq $1, d3, d2 + movq d2, 16(z) + shrdq $1, a, d3 + movq d3, 24(z) + +// Return + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_inv_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_inv_sm2.S new file mode 100644 index 00000000000..dffa018b221 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_inv_sm2.S @@ -0,0 +1,1629 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Modular inverse modulo p_sm2 = 2^256 - 2^224 - 2^96 + 2^64 - 1 +// Input x[4]; output z[4] +// +// extern void bignum_inv_sm2(uint64_t z[static 4],uint64_t x[static 4]); +// +// If the 4-digit input x is coprime to p_sm2, i.e. is not divisible +// by it, returns z < p_sm2 such that x * z == 1 (mod p_sm2). Note that +// x does not need to be reduced modulo p_sm2, but the output always is. +// If the input is divisible (i.e. is 0 or p_sm2), then there can be no +// modular inverse and z = 0 is returned. 
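The contract spelled out above can be checked against a short Python reference model (illustrative only, not part of the imported assembly):

P_SM2 = 2**256 - 2**224 - 2**96 + 2**64 - 1  # the SM2 field characteristic

def ref_inv_sm2(x):
    # Reduce first; return 0 when no inverse exists (x divisible by p_sm2),
    # otherwise the unique z < p_sm2 with (x * z) % p_sm2 == 1.
    x %= P_SM2
    return 0 if x == 0 else pow(x, -1, P_SM2)  # pow(_, -1, m) needs Python >= 3.8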
+// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_inv_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_inv_sm2) + .text + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define f 0(%rsp) +#define g (5*N)(%rsp) +#define u (10*N)(%rsp) +#define v (15*N)(%rsp) +#define tmp (20*N)(%rsp) +#define tmp2 (21*N)(%rsp) +#define i (22*N)(%rsp) +#define d (23*N)(%rsp) + +#define mat (24*N)(%rsp) + +// Backup for the input pointer + +#define res (28*N)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (30*N) + +// Syntactic variants to make x86_att version simpler to generate + +#define F 0 +#define G (5*N) +#define U (10*N) +#define V (15*N) +#define MAT (24*N) + +#define ff (%rsp) +#define gg (5*N)(%rsp) + +// --------------------------------------------------------------------------- +// Core signed almost-Montgomery reduction macro from u[4..0] to u[3..0]. +// --------------------------------------------------------------------------- + +#define amontred(P) \ +/* We only know the input is -2^316 < x < 2^316. To do traditional */ \ +/* unsigned Montgomery reduction, start by adding 2^61 * p_sm2. */ \ + movq $0xe000000000000000, %r8 ; \ + addq P, %r8 ; \ + movq $0x1fffffffffffffff, %r9 ; \ + adcq 8+P, %r9 ; \ + movq $0xffffffffe0000000, %r10 ; \ + adcq 16+P, %r10 ; \ + movq $0xffffffffffffffff, %r11 ; \ + adcq 24+P, %r11 ; \ + movq $0x1fffffffdfffffff, %r12 ; \ + adcq 32+P, %r12 ; \ +/* Let [%rcx;%rbx] = 2^32 * d0 and [%rdx;%rax] = (2^32-1) * d0 */ \ + movq %r8, %rbx ; \ + movq %r8, %rcx ; \ + shrq $32, %rcx ; \ + shlq $32, %rbx ; \ + movl $0xffffffff, %eax ; \ + mulq %r8; \ +/* Now [%r12;%r11;%r10;%r9] := [%r8;%r11;%r10;%r9] - [%rcx;%rbx;%rdx;%rax] */ \ + subq %rax, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + sbbq %rcx, %r8 ; \ + addq %r8, %r12 ; \ +/* Now capture carry and subtract p_sm2 if set (almost-Montgomery) */ \ + sbbq %rax, %rax ; \ + movl $0xffffffff, %ebx ; \ + notq %rbx; \ + andq %rax, %rbx ; \ + movq %rax, %rdx ; \ + btr $32, %rdx ; \ + subq %rax, %r9 ; \ + movq %r9, P ; \ + sbbq %rbx, %r10 ; \ + movq %r10, 8+P ; \ + sbbq %rax, %r11 ; \ + movq %r11, 16+P ; \ + sbbq %rdx, %r12 ; \ + movq %r12, 24+P + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. 
+// But different in register usage and returning the final matrix as +// +// [ %r8 %r10] +// [ %r12 %r14] +// +// and also returning the matrix still negated (which doesn't matter) + +#define divstep59(din,fin,gin) \ + movq din, %rsi ; \ + movq fin, %rdx ; \ + movq gin, %rcx ; \ + movq %rdx, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq $0xfffffffffffffffe, %rax ; \ + xorl %ebp, %ebp ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; 
\ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %rdx ; \ + leaq (%rcx,%rax), %rdi ; \ + shlq $0x16, %rdx ; \ + shlq $0x16, %rdi ; \ + sarq $0x2b, %rdx ; \ + sarq $0x2b, %rdi ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %rbx ; \ + leaq (%rcx,%rax), %rcx ; \ + sarq $0x2a, %rbx ; \ + sarq $0x2a, %rcx ; \ + movq %rdx, MAT(%rsp) ; \ + movq %rbx, MAT+0x8(%rsp) ; \ + movq %rdi, MAT+0x10(%rsp) ; \ + movq %rcx, MAT+0x18(%rsp) ; \ + movq fin, %r12 ; \ + imulq %r12, %rdi ; \ + imulq %rdx, %r12 ; \ + movq gin, %r13 ; \ + imulq %r13, %rbx ; \ + imulq %rcx, %r13 ; \ + addq %rbx, %r12 ; \ + addq %rdi, %r13 ; \ + sarq $0x14, %r12 ; \ + sarq 
$0x14, %r13 ; \ + movq %r12, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + movq %r13, %rcx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq $0xfffffffffffffffe, %rax ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, 
%r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %r8 ; \ + leaq (%rcx,%rax), %r10 ; \ + shlq $0x16, %r8 ; \ + shlq $0x16, %r10 ; \ + sarq $0x2b, %r8 ; \ + sarq $0x2b, %r10 ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %r15 ; \ + leaq (%rcx,%rax), %r11 ; \ + sarq $0x2a, %r15 ; \ + sarq $0x2a, %r11 ; \ + movq %r13, %rbx ; \ + movq %r12, %rcx ; \ + imulq %r8, %r12 ; \ + imulq %r15, %rbx ; \ + addq %rbx, %r12 ; \ + imulq %r11, %r13 ; \ + imulq %r10, %rcx ; \ + addq %rcx, %r13 ; \ + sarq $0x14, %r12 ; \ + sarq $0x14, %r13 ; \ + movq %r12, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + movq %r13, %rcx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq MAT(%rsp), %rax ; \ + imulq %r8, %rax ; \ + movq MAT+0x10(%rsp), %rdx ; \ + imulq %r15, %rdx ; \ + imulq MAT+0x8(%rsp), %r8 ; \ + imulq 
MAT+0x18(%rsp), %r15 ; \ + addq %r8, %r15 ; \ + leaq (%rax,%rdx), %r9 ; \ + movq MAT(%rsp), %rax ; \ + imulq %r10, %rax ; \ + movq MAT+0x10(%rsp), %rdx ; \ + imulq %r11, %rdx ; \ + imulq MAT+0x8(%rsp), %r10 ; \ + imulq MAT+0x18(%rsp), %r11 ; \ + addq %r10, %r11 ; \ + leaq (%rax,%rdx), %r13 ; \ + movq $0xfffffffffffffffe, %rax ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + 
subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %r8 ; \ + leaq (%rcx,%rax), %r12 ; \ + shlq $0x15, %r8 ; \ + shlq $0x15, %r12 ; \ + sarq $0x2b, %r8 ; \ + sarq $0x2b, %r12 ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %r10 ; \ + leaq (%rcx,%rax), %r14 ; \ + sarq $0x2b, %r10 ; \ + sarq $0x2b, %r14 ; \ + movq %r9, %rax ; \ + imulq %r8, %rax ; \ + movq %r13, %rdx ; \ + imulq %r10, %rdx ; \ + imulq %r15, %r8 ; \ + imulq %r11, %r10 ; \ + addq %r8, %r10 ; \ + leaq (%rax,%rdx), %r8 ; \ + movq %r9, %rax ; \ + imulq %r12, %rax ; \ + movq %r13, %rdx ; \ + imulq %r14, %rdx ; \ + imulq %r15, %r12 ; \ + imulq %r11, %r14 ; \ + addq %r12, %r14 ; \ + leaq (%rax,%rdx), %r12 + +S2N_BN_SYMBOL(bignum_inv_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room for temporaries + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Save the return pointer for the end so we can overwrite %rdi later + + movq %rdi, res + +// Create constant [%rdx;%rcx;%rbx;%rax] = p_sm2 and copy it into the variable 
f +// including the 5th zero digit + + xorl %ebp, %ebp + leaq -1(%rbp), %rax + movl $0x00000000ffffffff, %ebx + notq %rbx + movq %rax, %rcx + movq %rax, %rdx + btr $32, %rdx + + movq %rax, F(%rsp) + movq %rbx, F+8(%rsp) + movq %rcx, F+16(%rsp) + movq %rdx, F+24(%rsp) + movq %rbp, F+32(%rsp) + +// Now reduce the input modulo p_sm2, first negating the constant to get +// [%rdx;%rcx;%rbx;%rax] = 2^256 - p_sm2, adding it to x and hence getting +// the comparison x < p_sm2 <=> (2^256 - p_sm2) + x < 2^256 and choosing +// g accordingly. + + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %r11 + + movl $1, %eax + notq %rbx + xorl %ecx, %ecx + notq %rdx + + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + + cmovncq %r8, %rax + cmovncq %r9, %rbx + cmovncq %r10, %rcx + cmovncq %r11, %rdx + + movq %rax, G(%rsp) + movq %rbx, G+8(%rsp) + movq %rcx, G+16(%rsp) + movq %rdx, G+24(%rsp) + xorl %eax, %eax + movq %rax, G+32(%rsp) + +// Also maintain reduced < 2^256 vector [u,v] such that +// [f,g] == x * 2^{5*i-50} * [u,v] (mod p_sm2) +// starting with [p_sm2,x] == x * 2^{5*0-50} * [0,2^50] (mod p_sm2) +// The weird-looking 5*i modifications come in because we are doing +// 64-bit word-sized Montgomery reductions at each stage, which is +// 5 bits more than the 59-bit requirement to keep things stable. + + xorl %eax, %eax + movq %rax, U(%rsp) + movq %rax, U+8(%rsp) + movq %rax, U+16(%rsp) + movq %rax, U+24(%rsp) + + movq $0x0004000000000000, %rcx + movq %rcx, V(%rsp) + movq %rax, V+8(%rsp) + movq %rax, V+16(%rsp) + movq %rax, V+24(%rsp) + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special tenth iteration after a uniform +// first 9. + + movq $10, i + movq $1, d + jmp bignum_inv_sm2_midloop + +bignum_inv_sm2_loop: + +// Separate out the matrix into sign-magnitude pairs + + movq %r8, %r9 + sarq $63, %r9 + xorq %r9, %r8 + subq %r9, %r8 + + movq %r10, %r11 + sarq $63, %r11 + xorq %r11, %r10 + subq %r11, %r10 + + movq %r12, %r13 + sarq $63, %r13 + xorq %r13, %r12 + subq %r13, %r12 + + movq %r14, %r15 + sarq $63, %r15 + xorq %r15, %r14 + subq %r15, %r14 + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in temporary storage for the [u,v] part and do [f,g] first. + + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, tmp + + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, tmp2 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. 
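As a whole-number model of the digit-by-digit update that follows (a sketch only: m00..m11 stand for the signed, possibly negated, matrix entries produced by divstep59, and Python's >> on negative integers is an arithmetic shift, matching the shrdq-by-59 chain below):

def update_fg(f, g, m00, m01, m10, m11):
    # [f'; g'] = (M * [f; g]) scaled down by 2^59
    return (m00 * f + m01 * g) >> 59, (m10 * f + m11 * g) >> 59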
+// +// Digit 0 of [f,g] + + xorl %ebx, %ebx + movq F(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq G(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq F(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq G(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + +// Digit 1 of [f,g] + + xorl %ecx, %ecx + movq F+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq G+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $59, %rbx, %rdi + movq %rdi, F(%rsp) + + xorl %edi, %edi + movq F+N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq G+N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $59, %rbp, %rsi + movq %rsi, G(%rsp) + +// Digit 2 of [f,g] + + xorl %esi, %esi + movq F+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq G+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $59, %rcx, %rbx + movq %rbx, F+N(%rsp) + + xorl %ebx, %ebx + movq F+2*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq G+2*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $59, %rdi, %rbp + movq %rbp, G+N(%rsp) + +// Digits 3 and 4 of [f,g] + + movq F+3*N(%rsp), %rax + xorq %r9, %rax + movq F+4*N(%rsp), %rbp + xorq %r9, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq G+3*N(%rsp), %rax + xorq %r11, %rax + movq G+4*N(%rsp), %rdx + xorq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $59, %rsi, %rcx + movq %rcx, F+2*N(%rsp) + shrdq $59, %rbp, %rsi + sarq $59, %rbp + + movq F+3*N(%rsp), %rax + movq %rsi, F+3*N(%rsp) + + movq F+4*N(%rsp), %rsi + movq %rbp, F+4*N(%rsp) + + xorq %r13, %rax + xorq %r13, %rsi + andq %r12, %rsi + negq %rsi + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rsi + movq G+3*N(%rsp), %rax + xorq %r15, %rax + movq G+4*N(%rsp), %rdx + xorq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $59, %rbx, %rdi + movq %rdi, G+2*N(%rsp) + shrdq $59, %rsi, %rbx + movq %rbx, G+3*N(%rsp) + sarq $59, %rsi + movq %rsi, G+4*N(%rsp) + +// Get the initial carries back from storage and do the [u,v] accumulation + + movq tmp, %rbx + movq tmp2, %rbp + +// Digit 0 of [u,v] + + xorl %ecx, %ecx + movq U(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + + xorl %esi, %esi + movq U(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V(%rsp) + +// Digit 1 of [u,v] + + xorl %ebx, %ebx + movq U+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq U+N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, U+N(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq V+N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, V+N(%rsp) + +// Digit 2 of [u,v] + + xorl %ecx, %ecx + movq U+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq 
%rax, %rbx + adcq %rdx, %rcx + movq V+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + + xorl %esi, %esi + movq U+2*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U+2*N(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V+2*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V+2*N(%rsp) + +// Digits 3 and 4 of u (top is unsigned) + + movq U+3*N(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+3*N(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + +// Preload for last use of old u digit 3 + + movq U+3*N(%rsp), %rax + movq %rcx, U+3*N(%rsp) + movq %rdx, U+4*N(%rsp) + +// Digits 3 and 4 of v (top is unsigned) + + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx + negq %rcx + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rcx + movq V+3*N(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rsi, V+3*N(%rsp) + movq %rdx, V+4*N(%rsp) + +// Montgomery reduction of u + + amontred(u) + +// Montgomery reduction of v + + amontred(v) + +bignum_inv_sm2_midloop: + + divstep59(d,ff,gg) + movq %rsi, d + +// Next iteration + + decq i + jnz bignum_inv_sm2_loop + +// The 10th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + movq F(%rsp), %rax + movq G(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $63, %rax + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * [u,v] (mod p_sm2) +// we want to flip the sign of u according to that of f. 
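A minimal sketch of that sign fix-up (assuming, per the comment above, that |f| = 1 and f == x * u modulo p_sm2, up to the Montgomery scaling removed by the final amontred):

def pick_inverse(u, f_sign, p_sm2):
    # f == +1 gives x*u == 1, so z = u; f == -1 gives z = -u mod p_sm2.
    u %= p_sm2
    return u if f_sign > 0 else (p_sm2 - u) % p_sm2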
+ + movq %r8, %r9 + sarq $63, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + + movq %r10, %r11 + sarq $63, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + + movq %r12, %r13 + sarq $63, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + + movq %r14, %r15 + sarq $63, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + +// Adjust the initial value to allow for complement instead of negation + + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + +// Digit 0 of [u] + + xorl %r13d, %r13d + movq U(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq V(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + +// Digit 1 of [u] + + xorl %r14d, %r14d + movq U+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq V+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + +// Digit 2 of [u] + + xorl %r15d, %r15d + movq U+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq V+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + +// Digits 3 and 4 of u (top is unsigned) + + movq U+3*N(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq V+3*N(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + +// Store back and Montgomery reduce u + + movq %r12, U(%rsp) + movq %r13, U+N(%rsp) + movq %r14, U+2*N(%rsp) + movq %r15, U+3*N(%rsp) + movq %r9, U+4*N(%rsp) + + amontred(u) + +// Perform final strict reduction mod p_sm2 and copy to output + + movq U(%rsp), %r8 + movq U+N(%rsp), %r9 + movq U+2*N(%rsp), %r10 + movq U+3*N(%rsp), %r11 + + movl $1, %eax + movl $0x00000000ffffffff, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + bts $32, %rdx + + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + + cmovncq %r8, %rax + cmovncq %r9, %rbx + cmovncq %r10, %rcx + cmovncq %r11, %rdx + + movq res, %rdi + movq %rax, (%rdi) + movq %rbx, N(%rdi) + movq %rcx, 2*N(%rdi) + movq %rdx, 3*N(%rdi) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_nsm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_nsm2.S new file mode 100644 index 00000000000..52ed311d316 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_nsm2.S @@ -0,0 +1,203 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod n_sm2 +// Input x[k]; output z[4] +// +// extern void bignum_mod_nsm2 +// (uint64_t z[static 4], uint64_t k, uint64_t *x); +// +// Reduction is modulo the group order of the GM/T 0003-2012 curve SM2. 
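The three 64-bit constants that appear throughout this routine are the nonzero little-endian words of 2^256 - n_sm2, where n_sm2 is the published SM2 group order; a quick Python cross-check:

N_SM2 = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123
C = 2**256 - N_SM2  # added or subtracted below in place of n_sm2 itself
assert [(C >> (64 * i)) & (2**64 - 1) for i in range(4)] == \
    [0xAC440BF6C62ABEDD, 0x8DFC2094DE39FAD4, 0x0000000000000000, 0x0000000100000000]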
+// +// Standard x86-64 ABI: RDI = z, RSI = k, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = k, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_nsm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_nsm2) + .text + +#define z %rdi +#define k %rsi +#define x %rcx + +#define m0 %r8 +#define m1 %r9 +#define m2 %r10 +#define m3 %r11 +#define d %r12 + +#define n0 %rax +#define n1 %rbx +#define n3 %rdx +#define q %rdx + +#define qshort %edx + +S2N_BN_SYMBOL(bignum_mod_nsm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save extra registers + + pushq %rbx + pushq %r12 + +// If the input is already <= 3 words long, go to a trivial "copy" path + + cmpq $4, k + jc bignum_mod_nsm2_shortinput + +// Otherwise load the top 4 digits (top-down) and reduce k by 4 + + subq $4, k + movq 24(%rdx,k,8), m3 + movq 16(%rdx,k,8), m2 + movq 8(%rdx,k,8), m1 + movq (%rdx,k,8), m0 + +// Move x into another register to leave %rdx free for multiplies and use of n3 + + movq %rdx, x + +// Reduce the top 4 digits mod n_sm2 (a conditional subtraction of n_sm2) + + movq $0xac440bf6c62abedd, n0 + movq $0x8dfc2094de39fad4, n1 + movq $0x0000000100000000, n3 + + addq n0, m0 + adcq n1, m1 + adcq $0, m2 + adcq n3, m3 + sbbq d, d + notq d + andq d, n0 + andq d, n1 + andq d, n3 + subq n0, m0 + sbbq n1, m1 + sbbq $0, m2 + sbbq n3, m3 + +// Now do (k-4) iterations of 5->4 word modular reduction + + testq k, k + jz bignum_mod_nsm2_writeback + +bignum_mod_nsm2_loop: + +// Writing the input, with the new zeroth digit implicitly appended, as +// z = 2^256 * m3 + 2^192 * m2 + t, our intended quotient approximation is +// MIN ((m3 * (1 + 2^32 + 2^64) + m2 + 2^64) >> 64) (2^64 - 1) + + movq m2, d + movl $1, qshort + addq m3, d + adcq m3, q + + shrq $32, d + addq m3, d + + shrq $32, d + addq d, q + sbbq $0, q + +// Load the next digit so current m to reduce = [m3;m2;m1;m0;d] + + movq -8(x,k,8), d + +// Now form [m3;m2;m1;m0;d] = m - q * n_sm2 + + subq q, m3 + movq $0xac440bf6c62abedd, n0 + mulxq n0, n0, n1 + addq n0, d + adcq n1, m0 + movq $0x8dfc2094de39fad4, n0 + mulxq n0, n0, n1 + adcq $0, n1 + addq n0, m0 + adcq n1, m1 + movq $0x0000000100000000, n0 + mulxq n0, n0, n1 + adcq n0, m2 + adcq n1, m3 + +// Now our top word m3 is either zero or all 1s. 
Use it for a masked +// addition of n_sm2, which we can do by a *subtraction* of +// 2^256 - n_sm2 from our portion + + movq $0xac440bf6c62abedd, n0 + andq m3, n0 + movq $0x8dfc2094de39fad4, n1 + andq m3, n1 + movq $0x0000000100000000, n3 + andq m3, n3 + + subq n0, d + sbbq n1, m0 + sbbq $0, m1 + sbbq n3, m2 + +// Now shuffle registers up and loop + + movq m2, m3 + movq m1, m2 + movq m0, m1 + movq d, m0 + + decq k + jnz bignum_mod_nsm2_loop + +// Write back + +bignum_mod_nsm2_writeback: + + movq m0, (z) + movq m1, 8(z) + movq m2, 16(z) + movq m3, 24(z) + +// Restore registers and return + + popq %r12 + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +bignum_mod_nsm2_shortinput: + + xorq m0, m0 + xorq m1, m1 + xorq m2, m2 + xorq m3, m3 + + testq k, k + jz bignum_mod_nsm2_writeback + movq (%rdx), m0 + decq k + jz bignum_mod_nsm2_writeback + movq 8(%rdx), m1 + decq k + jz bignum_mod_nsm2_writeback + movq 16(%rdx), m2 + jmp bignum_mod_nsm2_writeback + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_nsm2_4.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_nsm2_4.S new file mode 100644 index 00000000000..e749f1298d7 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_nsm2_4.S @@ -0,0 +1,96 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod n_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_mod_nsm2_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Reduction is modulo the group order of the GM/T 0003-2012 curve SM2. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_nsm2_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_nsm2_4) + .text + +#define z %rdi +#define x %rsi + +#define d0 %rdx +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n0 %rax +#define n1 %r10 +#define n3 %r11 + +// Can re-use this as a temporary once we've loaded the input + +#define c %rsi + +S2N_BN_SYMBOL(bignum_mod_nsm2_4): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load a set of registers [n3; 0; n1; n0] = 2^256 - n_sm2 + + movq $0xac440bf6c62abedd, n0 + movq $0x8dfc2094de39fad4, n1 + movq $0x0000000100000000, n3 + +// Load the input and compute x + (2^256 - n_sm2) + + movq (x), d0 + addq n0, d0 + movq 8(x), d1 + adcq n1, d1 + movq 16(x), d2 + adcq $0, d2 + movq 24(x), d3 + adcq n3, d3 + +// Now CF is set iff 2^256 <= x + (2^256 - n_sm2), i.e. iff n_sm2 <= x. 
+// Create a mask for the condition x < n, and mask the three nontrivial digits +// ready to undo the previous addition with a compensating subtraction + + sbbq c, c + notq c + andq c, n0 + andq c, n1 + andq c, n3 + +// Now subtract mask * (2^256 - n_sm2) again and store + + subq n0, d0 + movq d0, (z) + sbbq n1, d1 + movq d1, 8(z) + sbbq $0, d2 + movq d2, 16(z) + sbbq n3, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_nsm2_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_nsm2_alt.S new file mode 100644 index 00000000000..662f76459a1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_nsm2_alt.S @@ -0,0 +1,211 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod n_sm2 +// Input x[k]; output z[4] +// +// extern void bignum_mod_nsm2_alt +// (uint64_t z[static 4], uint64_t k, uint64_t *x); +// +// Reduction is modulo the group order of the GM/T 0003-2012 curve SM2. +// +// Standard x86-64 ABI: RDI = z, RSI = k, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = k, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_nsm2_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_nsm2_alt) + .text + +#define z %rdi +#define k %rsi +#define x %rcx + +#define m0 %r8 +#define m1 %r9 +#define m2 %r10 +#define m3 %r11 +#define d %r12 + +#define n0 %rax +#define n1 %rbx +#define n3 %rdx + +#define q %rbx + +#define qshort %ebx + +S2N_BN_SYMBOL(bignum_mod_nsm2_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save extra registers + + pushq %rbx + pushq %r12 + +// If the input is already <= 3 words long, go to a trivial "copy" path + + cmpq $4, k + jc bignum_mod_nsm2_alt_shortinput + +// Otherwise load the top 4 digits (top-down) and reduce k by 4 + + subq $4, k + movq 24(%rdx,k,8), m3 + movq 16(%rdx,k,8), m2 + movq 8(%rdx,k,8), m1 + movq (%rdx,k,8), m0 + +// Move x into another register to leave %rdx free for multiplies and use of n3 + + movq %rdx, x + +// Reduce the top 4 digits mod n_sm2 (a conditional subtraction of n_sm2) + + movq $0xac440bf6c62abedd, n0 + movq $0x8dfc2094de39fad4, n1 + movq $0x0000000100000000, n3 + + addq n0, m0 + adcq n1, m1 + adcq $0, m2 + adcq n3, m3 + sbbq d, d + notq d + andq d, n0 + andq d, n1 + andq d, n3 + subq n0, m0 + sbbq n1, m1 + sbbq $0, m2 + sbbq n3, m3 + +// Now do (k-4) iterations of 5->4 word modular reduction + + testq k, k + jz bignum_mod_nsm2_alt_writeback + +bignum_mod_nsm2_alt_loop: + +// Writing the input, with the new zeroth digit implicitly appended, as +// z = 2^256 * m3 + 2^192 * m2 + t, our intended quotient approximation is +// MIN ((m3 * (1 + 2^32 + 2^64) + m2 + 2^64) >> 64) (2^64 - 1) + + movq m2, d + movl $1, qshort + addq m3, d + adcq m3, q + + shrq $32, d + addq m3, d + + shrq $32, d + addq d, q + sbbq $0, q + +// Load the next digit so current m to reduce = [m3;m2;m1;m0;d] + + movq -8(x,k,8), d + +// Now form [m3;m2;m1;m0;d] = m - q * n_sm2 + + subq q, m3 + + movq $0xac440bf6c62abedd, %rax + mulq q + addq 
%rax, d + adcq %rdx, m0 + adcq $0, m1 + adcq $0, m2 + adcq $0, m3 + + movq $0x8dfc2094de39fad4, %rax + mulq q + addq %rax, m0 + adcq %rdx, m1 + adcq $0, m2 + adcq $0, m3 + + movq $0x0000000100000000, %rax + mulq q + addq %rax, m2 + adcq %rdx, m3 + +// Now our top word m3 is either zero or all 1s. Use it for a masked +// addition of n_sm2, which we can do by a *subtraction* of +// 2^256 - n_sm2 from our portion + + movq $0xac440bf6c62abedd, n0 + andq m3, n0 + movq $0x8dfc2094de39fad4, n1 + andq m3, n1 + movq $0x0000000100000000, n3 + andq m3, n3 + + subq n0, d + sbbq n1, m0 + sbbq $0, m1 + sbbq n3, m2 + +// Now shuffle registers up and loop + + movq m2, m3 + movq m1, m2 + movq m0, m1 + movq d, m0 + + decq k + jnz bignum_mod_nsm2_alt_loop + +// Write back + +bignum_mod_nsm2_alt_writeback: + + movq m0, (z) + movq m1, 8(z) + movq m2, 16(z) + movq m3, 24(z) + +// Restore registers and return + + popq %r12 + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +bignum_mod_nsm2_alt_shortinput: + + xorq m0, m0 + xorq m1, m1 + xorq m2, m2 + xorq m3, m3 + + testq k, k + jz bignum_mod_nsm2_alt_writeback + movq (%rdx), m0 + decq k + jz bignum_mod_nsm2_alt_writeback + movq 8(%rdx), m1 + decq k + jz bignum_mod_nsm2_alt_writeback + movq 16(%rdx), m2 + jmp bignum_mod_nsm2_alt_writeback + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_sm2.S new file mode 100644 index 00000000000..27324a2a7de --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_sm2.S @@ -0,0 +1,198 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo field characteristic, z := x mod p_sm2 +// Input x[k]; output z[4] +// +// extern void bignum_mod_sm2 +// (uint64_t z[static 4], uint64_t k, uint64_t *x); +// +// Standard x86-64 ABI: RDI = z, RSI = k, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = k, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_sm2) + .text + +#define z %rdi +#define k %rsi +#define x %rdx + +#define m0 %r8 +#define m1 %r9 +#define m2 %r10 +#define m3 %r11 +#define d %r12 + +#define n0 %rax +#define n1 %rbx +#define n3 %rcx +#define q %rcx + +#define qshort %ecx + +S2N_BN_SYMBOL(bignum_mod_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save extra registers + + pushq %rbx + pushq %r12 + +// If the input is already <= 3 words long, go to a trivial "copy" path + + cmpq $4, k + jc bignum_mod_sm2_shortinput + +// Otherwise load the top 4 digits (top-down) and reduce k by 4 + + subq $4, k + movq 24(x,k,8), m3 + movq 16(x,k,8), m2 + movq 8(x,k,8), m1 + movq (x,k,8), m0 + +// Load non-trivial digits [n3; -1; n1; -1] = p_sm2 and do a conditional +// subtraction to reduce the four starting digits [m3;m2;m1;m0] modulo p_sm2 + + subq $-1, m0 + movq $0xffffffff00000000, n1 + sbbq n1, m1 + movq $0xfffffffeffffffff, n3 + sbbq $-1, m2 + sbbq n3, m3 + + sbbq n0, n0 + + andq n0, n1 + andq n0, n3 + addq n0, m0 + adcq n1, m1 + adcq n0, m2 + adcq n3, m3 + +// Now do (k-4) iterations of 5->4 word modular reduction + + testq k, k + jz bignum_mod_sm2_writeback + +bignum_mod_sm2_loop: + +// Writing the input, with the new zeroth digit implicitly appended, as +// z = 2^256 * m3 + 2^192 * m2 + t, our intended quotient approximation is +// MIN ((m3 * (1 + 2^32 + 2^64) + m2 + 2^64) >> 64) (2^64 - 1) + + movq m2, d + movl $1, qshort + addq m3, d + adcq m3, q + + shrq $32, d + addq m3, d + + shrq $32, d + addq d, q + sbbq $0, q + +// Load the next digit so current m to reduce = [m3;m2;m1;m0;d] + + movq -8(x,k,8), d + +// Now compute the initial pre-reduced [m3;m2;m1;m0;d] = m - p_sm2 * q +// = z - (2^256 - 2^224 - 2^96 + 2^64 - 1) * q + + movq q, n0 + movq q, n1 + shlq $32, n0 + shrq $32, n1 + + addq n0, m2 + adcq n1, m3 + + subq q, n0 + sbbq $0, n1 + + subq q, m3 + + addq q, d + adcq n0, m0 + adcq n1, m1 + adcq $0, m2 + adcq $0, m3 + +// Now our top word m3 is either zero or all 1s, and we use this to discriminate +// whether a correction is needed because our result is negative, as a bitmask +// Do a masked addition of p_sm2 + + movq $0xffffffff00000000, n1 + andq m3, n1 + movq $0xfffffffeffffffff, n3 + andq m3, n3 + addq m3, d + adcq n1, m0 + adcq m3, m1 + adcq n3, m2 + +// Shuffle registers up and loop + + movq m2, m3 + movq m1, m2 + movq m0, m1 + movq d, m0 + + decq k + jnz bignum_mod_sm2_loop + +// Write back + +bignum_mod_sm2_writeback: + + movq m0, (z) + movq m1, 8(z) + movq m2, 16(z) + movq m3, 24(z) + +// Restore registers and return + + popq %r12 + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +bignum_mod_sm2_shortinput: + + xorq m0, m0 + xorq m1, m1 + xorq m2, m2 + xorq m3, m3 + + testq k, k + jz bignum_mod_sm2_writeback + movq (%rdx), m0 + decq k + jz bignum_mod_sm2_writeback + movq 8(%rdx), m1 + decq k + jz 
bignum_mod_sm2_writeback + movq 16(%rdx), m2 + jmp bignum_mod_sm2_writeback + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_sm2_4.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_sm2_4.S new file mode 100644 index 00000000000..314a230c587 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_sm2_4.S @@ -0,0 +1,84 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo field characteristic, z := x mod p_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_mod_sm2_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_sm2_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_sm2_4) + .text + +#define z %rdi +#define x %rsi + +#define d0 %rdx +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n1 %r10 +#define n3 %r11 +#define c %rax + +S2N_BN_SYMBOL(bignum_mod_sm2_4): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load the input and subtract to get [d3;d3;d1;d1] = x - p_sm2 (modulo 2^256) +// The constants n1 and n3 in [n3; -1; n1; -1] = p_sm2 are saved for later + + movq (x), d0 + subq $-1, d0 + movq 8(x), d1 + movq $0xffffffff00000000, n1 + sbbq n1, d1 + movq 16(x), d2 + sbbq $-1, d2 + movq $0xfffffffeffffffff, n3 + movq 24(x), d3 + sbbq n3, d3 + +// Capture the carry to determine whether to add back p_sm2, and use +// it to create a masked p_sm2' = [n3; c; n1; c] + + sbbq c, c + andq c, n1 + andq c, n3 + +// Do the corrective addition and copy to output + + addq c, d0 + movq d0, (z) + adcq n1, d1 + movq d1, 8(z) + adcq c, d2 + movq d2, 16(z) + adcq n3, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montinv_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montinv_sm2.S new file mode 100644 index 00000000000..0c0d2507726 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montinv_sm2.S @@ -0,0 +1,1640 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery inverse modulo p_sm2 = 2^256 - 2^224 - 2^96 + 2^64 - 1 +// Input x[4]; output z[4] +// +// extern void bignum_montinv_sm2(uint64_t z[static 4],uint64_t x[static 4]); +// +// If the 4-digit input x is coprime to p_sm2, i.e. is not divisible +// by it, returns z < p_sm2 such that x * z == 2^512 (mod p_sm2). This +// is effectively "Montgomery inverse" because if we consider x and z as +// Montgomery forms of X and Z, i.e. x == 2^256 * X and z == 2^256 * Z +// (both mod p_sm2) then X * Z == 1 (mod p_sm2). That is, this function +// gives the analog of the modular inverse bignum_inv_sm2 but with both +// input and output in the Montgomery domain. 
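+// (Equivalently, in plain terms: z is 2^512 * x^-1 mod p_sm2, i.e. the
+// ordinary modular inverse of x carrying an extra factor of 2^512, which is
+// where the congruence x * z == 2^512 above comes from.)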
Note that x does not need +// to be reduced modulo p_sm2, but the output always is. If the input +// is divisible (i.e. is 0 or p_sm2), then there can be no solution to +// the congruence x * z == 2^512 (mod p_sm2), and z = 0 is returned. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montinv_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montinv_sm2) + .text + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define f 0(%rsp) +#define g (5*N)(%rsp) +#define u (10*N)(%rsp) +#define v (15*N)(%rsp) +#define tmp (20*N)(%rsp) +#define tmp2 (21*N)(%rsp) +#define i (22*N)(%rsp) +#define d (23*N)(%rsp) + +#define mat (24*N)(%rsp) + +// Backup for the input pointer + +#define res (28*N)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (30*N) + +// Syntactic variants to make x86_att version simpler to generate + +#define F 0 +#define G (5*N) +#define U (10*N) +#define V (15*N) +#define MAT (24*N) + +#define ff (%rsp) +#define gg (5*N)(%rsp) + +// --------------------------------------------------------------------------- +// Core signed almost-Montgomery reduction macro from u[4..0] to u[3..0]. +// --------------------------------------------------------------------------- + +#define amontred(P) \ +/* We only know the input is -2^316 < x < 2^316. To do traditional */ \ +/* unsigned Montgomery reduction, start by adding 2^61 * p_sm2. */ \ + movq $0xe000000000000000, %r8 ; \ + addq P, %r8 ; \ + movq $0x1fffffffffffffff, %r9 ; \ + adcq 8+P, %r9 ; \ + movq $0xffffffffe0000000, %r10 ; \ + adcq 16+P, %r10 ; \ + movq $0xffffffffffffffff, %r11 ; \ + adcq 24+P, %r11 ; \ + movq $0x1fffffffdfffffff, %r12 ; \ + adcq 32+P, %r12 ; \ +/* Let [%rcx;%rbx] = 2^32 * d0 and [%rdx;%rax] = (2^32-1) * d0 */ \ + movq %r8, %rbx ; \ + movq %r8, %rcx ; \ + shrq $32, %rcx ; \ + shlq $32, %rbx ; \ + movl $0xffffffff, %eax ; \ + mulq %r8; \ +/* Now [%r12;%r11;%r10;%r9] := [%r8;%r11;%r10;%r9] - [%rcx;%rbx;%rdx;%rax] */ \ + subq %rax, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + sbbq %rcx, %r8 ; \ + addq %r8, %r12 ; \ +/* Now capture carry and subtract p_sm2 if set (almost-Montgomery) */ \ + sbbq %rax, %rax ; \ + movl $0xffffffff, %ebx ; \ + notq %rbx; \ + andq %rax, %rbx ; \ + movq %rax, %rdx ; \ + btr $32, %rdx ; \ + subq %rax, %r9 ; \ + movq %r9, P ; \ + sbbq %rbx, %r10 ; \ + movq %r10, 8+P ; \ + sbbq %rax, %r11 ; \ + movq %r11, 16+P ; \ + sbbq %rdx, %r12 ; \ + movq %r12, 24+P + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. 
+// But different in register usage and returning the final matrix as +// +// [ %r8 %r10] +// [ %r12 %r14] +// +// and also returning the matrix still negated (which doesn't matter) + +#define divstep59(din,fin,gin) \ + movq din, %rsi ; \ + movq fin, %rdx ; \ + movq gin, %rcx ; \ + movq %rdx, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq $0xfffffffffffffffe, %rax ; \ + xorl %ebp, %ebp ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; 
\ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %rdx ; \ + leaq (%rcx,%rax), %rdi ; \ + shlq $0x16, %rdx ; \ + shlq $0x16, %rdi ; \ + sarq $0x2b, %rdx ; \ + sarq $0x2b, %rdi ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %rbx ; \ + leaq (%rcx,%rax), %rcx ; \ + sarq $0x2a, %rbx ; \ + sarq $0x2a, %rcx ; \ + movq %rdx, MAT(%rsp) ; \ + movq %rbx, MAT+0x8(%rsp) ; \ + movq %rdi, MAT+0x10(%rsp) ; \ + movq %rcx, MAT+0x18(%rsp) ; \ + movq fin, %r12 ; \ + imulq %r12, %rdi ; \ + imulq %rdx, %r12 ; \ + movq gin, %r13 ; \ + imulq %r13, %rbx ; \ + imulq %rcx, %r13 ; \ + addq %rbx, %r12 ; \ + addq %rdi, %r13 ; \ + sarq $0x14, %r12 ; \ + sarq 
$0x14, %r13 ; \ + movq %r12, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + movq %r13, %rcx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq $0xfffffffffffffffe, %rax ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, 
%r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %r8 ; \ + leaq (%rcx,%rax), %r10 ; \ + shlq $0x16, %r8 ; \ + shlq $0x16, %r10 ; \ + sarq $0x2b, %r8 ; \ + sarq $0x2b, %r10 ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %r15 ; \ + leaq (%rcx,%rax), %r11 ; \ + sarq $0x2a, %r15 ; \ + sarq $0x2a, %r11 ; \ + movq %r13, %rbx ; \ + movq %r12, %rcx ; \ + imulq %r8, %r12 ; \ + imulq %r15, %rbx ; \ + addq %rbx, %r12 ; \ + imulq %r11, %r13 ; \ + imulq %r10, %rcx ; \ + addq %rcx, %r13 ; \ + sarq $0x14, %r12 ; \ + sarq $0x14, %r13 ; \ + movq %r12, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + movq %r13, %rcx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq MAT(%rsp), %rax ; \ + imulq %r8, %rax ; \ + movq MAT+0x10(%rsp), %rdx ; \ + imulq %r15, %rdx ; \ + imulq MAT+0x8(%rsp), %r8 ; \ + imulq 
MAT+0x18(%rsp), %r15 ; \ + addq %r8, %r15 ; \ + leaq (%rax,%rdx), %r9 ; \ + movq MAT(%rsp), %rax ; \ + imulq %r10, %rax ; \ + movq MAT+0x10(%rsp), %rdx ; \ + imulq %r11, %rdx ; \ + imulq MAT+0x8(%rsp), %r10 ; \ + imulq MAT+0x18(%rsp), %r11 ; \ + addq %r10, %r11 ; \ + leaq (%rax,%rdx), %r13 ; \ + movq $0xfffffffffffffffe, %rax ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + 
subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %r8 ; \ + leaq (%rcx,%rax), %r12 ; \ + shlq $0x15, %r8 ; \ + shlq $0x15, %r12 ; \ + sarq $0x2b, %r8 ; \ + sarq $0x2b, %r12 ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %r10 ; \ + leaq (%rcx,%rax), %r14 ; \ + sarq $0x2b, %r10 ; \ + sarq $0x2b, %r14 ; \ + movq %r9, %rax ; \ + imulq %r8, %rax ; \ + movq %r13, %rdx ; \ + imulq %r10, %rdx ; \ + imulq %r15, %r8 ; \ + imulq %r11, %r10 ; \ + addq %r8, %r10 ; \ + leaq (%rax,%rdx), %r8 ; \ + movq %r9, %rax ; \ + imulq %r12, %rax ; \ + movq %r13, %rdx ; \ + imulq %r14, %rdx ; \ + imulq %r15, %r12 ; \ + imulq %r11, %r14 ; \ + addq %r12, %r14 ; \ + leaq (%rax,%rdx), %r12 + +S2N_BN_SYMBOL(bignum_montinv_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room for temporaries + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Save the return pointer for the end so we can overwrite %rdi later + + movq %rdi, res + +// Create constant [%rdx;%rcx;%rbx;%rax] = p_sm2 and copy it into the 
variable f +// including the 5th zero digit + + xorl %ebp, %ebp + leaq -1(%rbp), %rax + movl $0x00000000ffffffff, %ebx + notq %rbx + movq %rax, %rcx + movq %rax, %rdx + btr $32, %rdx + + movq %rax, F(%rsp) + movq %rbx, F+8(%rsp) + movq %rcx, F+16(%rsp) + movq %rdx, F+24(%rsp) + movq %rbp, F+32(%rsp) + +// Now reduce the input modulo p_sm2, first negating the constant to get +// [%rdx;%rcx;%rbx;%rax] = 2^256 - p_sm2, adding it to x and hence getting +// the comparison x < p_sm2 <=> (2^256 - p_sm2) + x < 2^256 and choosing +// g accordingly. + + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %r11 + + movl $1, %eax + notq %rbx + xorl %ecx, %ecx + notq %rdx + + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + + cmovncq %r8, %rax + cmovncq %r9, %rbx + cmovncq %r10, %rcx + cmovncq %r11, %rdx + + movq %rax, G(%rsp) + movq %rbx, G+8(%rsp) + movq %rcx, G+16(%rsp) + movq %rdx, G+24(%rsp) + xorl %eax, %eax + movq %rax, G+32(%rsp) + +// Also maintain reduced < 2^256 vector [u,v] such that +// [f,g] == x * 2^{5*i-562} * [u,v] (mod p_sm2) +// starting with [p_sm2,x] == x * 2^{5*0-562} * [0,2^562] (mod p_sm2) +// The weird-looking 5*i modifications come in because we are doing +// 64-bit word-sized Montgomery reductions at each stage, which is +// 5 bits more than the 59-bit requirement to keep things stable. +// After the 10th and last iteration and sign adjustment, when +// f == 1 for in-scope cases, we have x * 2^{50-562} * u == 1, i.e. +// x * u == 2^512 as required. + + xorl %eax, %eax + movq %rax, U(%rsp) + movq %rax, U+8(%rsp) + movq %rax, U+16(%rsp) + movq %rax, U+24(%rsp) + + movq $0x000c000000100000, %rax + movq %rax, V(%rsp) + movq $0x000bfffffff80000, %rax + movq %rax, V+8(%rsp) + movq $0x00040000000c0000, %rax + movq %rax, V+16(%rsp) + movq $0x0018000000040000, %rax + movq %rax, V+24(%rsp) + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special tenth iteration after a uniform +// first 9. + + movq $10, i + movq $1, d + jmp bignum_montinv_sm2_midloop + +bignum_montinv_sm2_loop: + +// Separate out the matrix into sign-magnitude pairs + + movq %r8, %r9 + sarq $63, %r9 + xorq %r9, %r8 + subq %r9, %r8 + + movq %r10, %r11 + sarq $63, %r11 + xorq %r11, %r10 + subq %r11, %r10 + + movq %r12, %r13 + sarq $63, %r13 + xorq %r13, %r12 + subq %r13, %r12 + + movq %r14, %r15 + sarq $63, %r15 + xorq %r15, %r14 + subq %r15, %r14 + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in temporary storage for the [u,v] part and do [f,g] first. + + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, tmp + + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, tmp2 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. 
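+// Schematically, each digit pass below implements the following sketch
+// (illustrative pseudo-C only; m00, m01, m10, m11 stand for the signed matrix
+// entries returned by divstep59 in %r8, %r10, %r12, %r14, up to the overall
+// negation noted earlier, and old_f/old_g are the incoming 5-digit values):
+//
+//     new_f = (m00 * old_f + m01 * old_g) >> 59;
+//     new_g = (m10 * old_f + m11 * old_g) >> 59;
+//
+// realized digit-by-digit with sign-magnitude multiplications and a two-word
+// carry between digits.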
+// +// Digit 0 of [f,g] + + xorl %ebx, %ebx + movq F(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq G(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq F(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq G(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + +// Digit 1 of [f,g] + + xorl %ecx, %ecx + movq F+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq G+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $59, %rbx, %rdi + movq %rdi, F(%rsp) + + xorl %edi, %edi + movq F+N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq G+N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $59, %rbp, %rsi + movq %rsi, G(%rsp) + +// Digit 2 of [f,g] + + xorl %esi, %esi + movq F+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq G+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $59, %rcx, %rbx + movq %rbx, F+N(%rsp) + + xorl %ebx, %ebx + movq F+2*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq G+2*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $59, %rdi, %rbp + movq %rbp, G+N(%rsp) + +// Digits 3 and 4 of [f,g] + + movq F+3*N(%rsp), %rax + xorq %r9, %rax + movq F+4*N(%rsp), %rbp + xorq %r9, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq G+3*N(%rsp), %rax + xorq %r11, %rax + movq G+4*N(%rsp), %rdx + xorq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $59, %rsi, %rcx + movq %rcx, F+2*N(%rsp) + shrdq $59, %rbp, %rsi + sarq $59, %rbp + + movq F+3*N(%rsp), %rax + movq %rsi, F+3*N(%rsp) + + movq F+4*N(%rsp), %rsi + movq %rbp, F+4*N(%rsp) + + xorq %r13, %rax + xorq %r13, %rsi + andq %r12, %rsi + negq %rsi + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rsi + movq G+3*N(%rsp), %rax + xorq %r15, %rax + movq G+4*N(%rsp), %rdx + xorq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $59, %rbx, %rdi + movq %rdi, G+2*N(%rsp) + shrdq $59, %rsi, %rbx + movq %rbx, G+3*N(%rsp) + sarq $59, %rsi + movq %rsi, G+4*N(%rsp) + +// Get the initial carries back from storage and do the [u,v] accumulation + + movq tmp, %rbx + movq tmp2, %rbp + +// Digit 0 of [u,v] + + xorl %ecx, %ecx + movq U(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + + xorl %esi, %esi + movq U(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V(%rsp) + +// Digit 1 of [u,v] + + xorl %ebx, %ebx + movq U+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq U+N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, U+N(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq V+N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, V+N(%rsp) + +// Digit 2 of [u,v] + + xorl %ecx, %ecx + movq U+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq 
%rax, %rbx + adcq %rdx, %rcx + movq V+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + + xorl %esi, %esi + movq U+2*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U+2*N(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V+2*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V+2*N(%rsp) + +// Digits 3 and 4 of u (top is unsigned) + + movq U+3*N(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+3*N(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + +// Preload for last use of old u digit 3 + + movq U+3*N(%rsp), %rax + movq %rcx, U+3*N(%rsp) + movq %rdx, U+4*N(%rsp) + +// Digits 3 and 4 of v (top is unsigned) + + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx + negq %rcx + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rcx + movq V+3*N(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rsi, V+3*N(%rsp) + movq %rdx, V+4*N(%rsp) + +// Montgomery reduction of u + + amontred(u) + +// Montgomery reduction of v + + amontred(v) + +bignum_montinv_sm2_midloop: + + divstep59(d,ff,gg) + movq %rsi, d + +// Next iteration + + decq i + jnz bignum_montinv_sm2_loop + +// The 10th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + movq F(%rsp), %rax + movq G(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $63, %rax + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * 2^{-512} [u,v] (mod p_sm2) +// we want to flip the sign of u according to that of f. 
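+// (For reference, the sign-magnitude split below uses the standard
+// two's-complement identity: with s = v >> 63 (arithmetic shift, so s is
+// 0 or -1), (v ^ s) - s == |v|, while s itself records the sign.)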
+ + movq %r8, %r9 + sarq $63, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + + movq %r10, %r11 + sarq $63, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + + movq %r12, %r13 + sarq $63, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + + movq %r14, %r15 + sarq $63, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + +// Adjust the initial value to allow for complement instead of negation + + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + +// Digit 0 of [u] + + xorl %r13d, %r13d + movq U(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq V(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + +// Digit 1 of [u] + + xorl %r14d, %r14d + movq U+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq V+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + +// Digit 2 of [u] + + xorl %r15d, %r15d + movq U+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq V+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + +// Digits 3 and 4 of u (top is unsigned) + + movq U+3*N(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq V+3*N(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + +// Store back and Montgomery reduce u + + movq %r12, U(%rsp) + movq %r13, U+N(%rsp) + movq %r14, U+2*N(%rsp) + movq %r15, U+3*N(%rsp) + movq %r9, U+4*N(%rsp) + + amontred(u) + +// Perform final strict reduction mod p_sm2 and copy to output + + movq U(%rsp), %r8 + movq U+N(%rsp), %r9 + movq U+2*N(%rsp), %r10 + movq U+3*N(%rsp), %r11 + + movl $1, %eax + movl $0x00000000ffffffff, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + bts $32, %rdx + + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + + cmovncq %r8, %rax + cmovncq %r9, %rbx + cmovncq %r10, %rcx + cmovncq %r11, %rdx + + movq res, %rdi + movq %rax, (%rdi) + movq %rbx, N(%rdi) + movq %rcx, 2*N(%rdi) + movq %rdx, 3*N(%rdi) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montmul_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montmul_sm2.S new file mode 100644 index 00000000000..e381d54bb82 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montmul_sm2.S @@ -0,0 +1,212 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^256) mod p_sm2 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_montmul_sm2 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Does z := (2^{-256} * x * y) mod p_sm2, assuming that the inputs x and y +// satisfy x * y <= 2^256 * p_sm2 (in particular this is true if we are in +// the "usual" case x < p_sm2 and y < p_sm2). 
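+// Illustrative note: the precondition above holds automatically for reduced
+// inputs, since x, y < p_sm2 < 2^256 gives x * y < p_sm2 * 2^256. A
+// hypothetical C caller, using just the declaration above, might read:
+//
+//     uint64_t z[4], x[4] = {2, 0, 0, 0}, y[4] = {3, 0, 0, 0};
+//     bignum_montmul_sm2(z, x, y);   // z = 2 * 3 / 2^256 mod p_sm2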
+// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_sm2) + .text + +#define z %rdi +#define x %rsi + +// We move the y argument here so we can use %rdx for multipliers + +#define y %rcx + +// Use this fairly consistently for a zero + +#define zero %rbp +#define zeroe %ebp + +// mulpadd(high,low,m) adds %rdx * m to a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rbx as temporaries. + +#define mulpadd(high,low,m) \ + mulxq m, %rax, %rbx ; \ + adcxq %rax, low ; \ + adoxq %rbx, high + +// mulpade(high,low,m) adds %rdx * m to a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax as a temporary, assuming high created from scratch +// and that zero has value zero. + +#define mulpade(high,low,m) \ + mulxq m, %rax, high ; \ + adcxq %rax, low ; \ + adoxq zero, high + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d0;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1], and using %rax, %rcx, %rdx and %rbx +// as temporaries. +// --------------------------------------------------------------------------- + +#define montreds(d3,d2,d1,d0) \ + movq d0, %rax ; \ + shlq $32, %rax ; \ + movq d0, %rcx ; \ + shrq $32, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq d0, %rax ; \ + sbbq $0, %rcx ; \ + subq %rax, d1 ; \ + sbbq %rcx, d2 ; \ + sbbq %rdx, d3 ; \ + sbbq %rbx, d0 + +S2N_BN_SYMBOL(bignum_montmul_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Copy y into a safe register to start with + + movq %rdx, y + +// Zero a register, which also makes sure we don't get a fake carry-in + + xorl zeroe, zeroe + +// Do the zeroth row, which is a bit different + + movq (y), %rdx + + mulxq (x), %r8, %r9 + mulxq 8(x), %rax, %r10 + addq %rax, %r9 + mulxq 16(x), %rax, %r11 + adcq %rax, %r10 + mulxq 24(x), %rax, %r12 + adcq %rax, %r11 + adcq zero, %r12 + +// Add row 1 + + xorl zeroe, zeroe + movq 8(y), %rdx + mulpadd(%r10,%r9,(x)) + mulpadd(%r11,%r10,8(x)) + mulpadd(%r12,%r11,16(x)) + mulpade(%r13,%r12,24(x)) + adcxq zero, %r13 + +// Add row 2 + + xorl zeroe, zeroe + movq 16(y), %rdx + mulpadd(%r11,%r10,(x)) + mulpadd(%r12,%r11,8(x)) + mulpadd(%r13,%r12,16(x)) + mulpade(%r14,%r13,24(x)) + adcxq zero, %r14 + +// Add row 3 + + xorl zeroe, zeroe + movq 24(y), %rdx + mulpadd(%r12,%r11,(x)) + mulpadd(%r13,%r12,8(x)) + mulpadd(%r14,%r13,16(x)) + mulpade(%r15,%r14,24(x)) + adcxq zero, %r15 + +// Multiplication complete. 
Perform 4 Montgomery steps to rotate the lower half + + montreds(%r11,%r10,%r9,%r8) + montreds(%r8,%r11,%r10,%r9) + montreds(%r9,%r8,%r11,%r10) + montreds(%r10,%r9,%r8,%r11) + +// Add high and low parts, catching carry in %rax + + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + +// Load [%r8;%r11;%rbp;%rdx;%rcx] = 2^320 - p_sm2 then do +// [%r8;%r11;%rbp;%rdx;%rcx] = [%rax;%r15;%r14;%r13;%r12] + (2^320 - p_sm2) + + movl $1, %ecx + movl $0x00000000FFFFFFFF, %edx + xorl %ebp, %ebp + addq %r12, %rcx + leaq 1(%rdx), %r11 + adcq %r13, %rdx + leaq -1(%rbp), %r8 + adcq %r14, %rbp + adcq %r15, %r11 + adcq %rax, %r8 + +// Now carry is set if r + (2^320 - p_sm2) >= 2^320, i.e. r >= p_sm2 +// where r is the pre-reduced form. So conditionally select the +// output accordingly. + + cmovcq %rcx, %r12 + cmovcq %rdx, %r13 + cmovcq %rbp, %r14 + cmovcq %r11, %r15 + +// Write back reduced value + + movq %r12, (z) + movq %r13, 8(z) + movq %r14, 16(z) + movq %r15, 24(z) + +// Restore saved registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montmul_sm2_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montmul_sm2_alt.S new file mode 100644 index 00000000000..23ce5a8dd30 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montmul_sm2_alt.S @@ -0,0 +1,214 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^256) mod p_sm2 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_montmul_sm2_alt +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Does z := (2^{-256} * x * y) mod p_sm2, assuming that the inputs x and y +// satisfy x * y <= 2^256 * p_sm2 (in particular this is true if we are in +// the "usual" case x < p_sm2 and y < p_sm2). +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_sm2_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_sm2_alt) + .text + +#define z %rdi +#define x %rsi + +// We move the y argument here so we can use %rdx for multipliers + +#define y %rcx + +// Macro for the key "multiply and add to (c,h,l)" step + +#define combadd(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// A minutely shorter form for when c = 0 initially + +#define combadz(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq c, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. 
Takes input in +// [d3;d2;d1;d0] and returns result in [d0;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1], and using %rax, %rcx, %rdx and %rbx +// as temporaries. +// --------------------------------------------------------------------------- + +#define montreds(d3,d2,d1,d0) \ + movq d0, %rax ; \ + shlq $32, %rax ; \ + movq d0, %rcx ; \ + shrq $32, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq d0, %rax ; \ + sbbq $0, %rcx ; \ + subq %rax, d1 ; \ + sbbq %rcx, d2 ; \ + sbbq %rdx, d3 ; \ + sbbq %rbx, d0 + +S2N_BN_SYMBOL(bignum_montmul_sm2_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Copy y into a safe register to start with + + movq %rdx, y + +// Start the window as [%r10;%r9;%r8] with 00 product + + movq (x), %rax + mulq (y) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + +// Column 1 + + xorq %r11, %r11 + combads(%r10,%r9,(x),8(y)) + combadz(%r11,%r10,%r9,8(x),(y)) + +// Column 2 + + xorq %r12, %r12 + combadz(%r12,%r11,%r10,(x),16(y)) + combadd(%r12,%r11,%r10,8(x),8(y)) + combadd(%r12,%r11,%r10,16(x),(y)) + +// Column 3 + + xorq %r13, %r13 + combadz(%r13,%r12,%r11,(x),24(y)) + combadd(%r13,%r12,%r11,8(x),16(y)) + combadd(%r13,%r12,%r11,16(x),8(y)) + combadd(%r13,%r12,%r11,24(x),(y)) + +// Column 4 + + xorq %r14, %r14 + combadz(%r14,%r13,%r12,8(x),24(y)) + combadd(%r14,%r13,%r12,16(x),16(y)) + combadd(%r14,%r13,%r12,24(x),8(y)) + +// Column 5 + + xorq %r15, %r15 + combadz(%r15,%r14,%r13,16(x),24(y)) + combadd(%r15,%r14,%r13,24(x),16(y)) + +// Final work for columns 6 and 7 + + movq 24(x), %rax + mulq 24(y) + addq %rax, %r14 + adcq %rdx, %r15 + +// Multiplication complete. Perform 4 Montgomery steps to rotate the lower half + + montreds(%r11,%r10,%r9,%r8) + montreds(%r8,%r11,%r10,%r9) + montreds(%r9,%r8,%r11,%r10) + montreds(%r10,%r9,%r8,%r11) + +// Add high and low parts, catching carry in %rax + + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + +// Load [%r8;%r11;%rbx;%rdx;%rcx] = 2^320 - p_sm2 then do +// [%r8;%r11;%rbx;%rdx;%rcx] = [%rax;%r15;%r14;%r13;%r12] + (2^320 - p_sm2) + + movl $1, %ecx + movl $0x00000000FFFFFFFF, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 1(%rdx), %r11 + adcq %r13, %rdx + leaq -1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + +// Now carry is set if r + (2^320 - p_sm2) >= 2^320, i.e. r >= p_sm2 +// where r is the pre-reduced form. So conditionally select the +// output accordingly. + + cmovcq %rcx, %r12 + cmovcq %rdx, %r13 + cmovcq %rbx, %r14 + cmovcq %r11, %r15 + +// Write back reduced value + + movq %r12, (z) + movq %r13, 8(z) + movq %r14, 16(z) + movq %r15, 24(z) + +// Restore saved registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montsqr_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montsqr_sm2.S new file mode 100644 index 00000000000..5ecefb2c68c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montsqr_sm2.S @@ -0,0 +1,194 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^256) mod p_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_montsqr_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Does z := (x^2 / 2^256) mod p_sm2, assuming x^2 <= 2^256 * p_sm2, which is +// guaranteed in particular if x < p_sm2 initially (the "intended" case). +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_sm2) + .text + +#define z %rdi +#define x %rsi + +// Use this fairly consistently for a zero + +#define zero %rbp +#define zeroe %ebp + +// Add %rdx * m into a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rbx as temporaries + +#define mulpadd(high,low,m) \ + mulxq m, %rax, %rbx ; \ + adcxq %rax, low ; \ + adoxq %rbx, high + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d0;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1], and using %rax, %rcx, %rdx and %rbx +// as temporaries. +// --------------------------------------------------------------------------- + +#define montreds(d3,d2,d1,d0) \ + movq d0, %rax ; \ + shlq $32, %rax ; \ + movq d0, %rcx ; \ + shrq $32, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq d0, %rax ; \ + sbbq $0, %rcx ; \ + subq %rax, d1 ; \ + sbbq %rcx, d2 ; \ + sbbq %rdx, d3 ; \ + sbbq %rbx, d0 + +S2N_BN_SYMBOL(bignum_montsqr_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Compute [%r15;%r8] = [00] which we use later, but mainly +// set up an initial window [%r14;...;%r9] = [23;03;01] + + movq (x), %rdx + mulxq %rdx, %r8, %r15 + mulxq 8(x), %r9, %r10 + mulxq 24(x), %r11, %r12 + movq 16(x), %rdx + mulxq 24(x), %r13, %r14 + +// Clear our zero register, and also initialize the flags for the carry chain + + xorl zeroe, zeroe + +// Chain in the addition of 02 + 12 + 13 to that window (no carry-out possible) +// This gives all the "heterogeneous" terms of the squaring ready to double + + mulpadd(%r11,%r10,(x)) + mulpadd(%r12,%r11,8(x)) + movq 24(x), %rdx + mulpadd(%r13,%r12,8(x)) + adcxq zero, %r13 + adoxq zero, %r14 + adcq zero, %r14 + +// Double and add to the 00 + 11 + 22 + 33 terms + + xorl zeroe, zeroe + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 8(x), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 16(x), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 24(x), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq zero, %r15 + adoxq zero, %r15 + +// Squaring complete. 
Perform 4 Montgomery steps to rotate the lower half + + montreds(%r11,%r10,%r9,%r8) + montreds(%r8,%r11,%r10,%r9) + montreds(%r9,%r8,%r11,%r10) + montreds(%r10,%r9,%r8,%r11) + +// Add high and low parts, catching carry in %rax + + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + +// Load [%r8;%r11;%rbp;%rdx;%rcx] = 2^320 - p_sm2 then do +// [%r8;%r11;%rbp;%rdx;%rcx] = [%rax;%r15;%r14;%r13;%r12] + (2^320 - p_sm2) + + movl $1, %ecx + movl $0x00000000FFFFFFFF, %edx + xorl %ebp, %ebp + addq %r12, %rcx + leaq 1(%rdx), %r11 + adcq %r13, %rdx + leaq -1(%rbp), %r8 + adcq %r14, %rbp + adcq %r15, %r11 + adcq %rax, %r8 + +// Now carry is set if r + (2^320 - p_sm2) >= 2^320, i.e. r >= p_sm2 +// where r is the pre-reduced form. So conditionally select the +// output accordingly. + + cmovcq %rcx, %r12 + cmovcq %rdx, %r13 + cmovcq %rbp, %r14 + cmovcq %r11, %r15 + +// Write back reduced value + + movq %r12, (z) + movq %r13, 8(z) + movq %r14, 16(z) + movq %r15, 24(z) + +// Restore saved registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montsqr_sm2_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montsqr_sm2_alt.S new file mode 100644 index 00000000000..cd970f10265 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montsqr_sm2_alt.S @@ -0,0 +1,223 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^256) mod p_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_montsqr_sm2_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Does z := (x^2 / 2^256) mod p_sm2, assuming x^2 <= 2^256 * p_sm2, which is +// guaranteed in particular if x < p_sm2 initially (the "intended" case). +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_sm2_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_sm2_alt) + .text + +#define z %rdi +#define x %rsi + +// Add %rbx * m into a register-pair (high,low) maintaining consistent +// carry-catching with carry (negated, as bitmask) and using %rax and %rdx +// as temporaries + +#define mulpadd(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rbx; \ + subq carry, %rdx ; \ + addq %rax, low ; \ + adcq %rdx, high ; \ + sbbq carry, carry + +// Initial version assuming no carry-in + +#define mulpadi(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rbx; \ + addq %rax, low ; \ + adcq %rdx, high ; \ + sbbq carry, carry + +// End version not catching the top carry-out + +#define mulpade(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rbx; \ + subq carry, %rdx ; \ + addq %rax, low ; \ + adcq %rdx, high + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d0;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1], and using %rax, %rcx, %rdx and %rbx +// as temporaries. 
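+//
+// For intuition, the same word-step can be written as a portable C sketch
+// (illustrative only, not part of the assembled sources; the helper name
+// montreds_step and the GCC/Clang unsigned __int128 extension are
+// assumptions of the sketch). Since the low word of p_sm2 is 2^64 - 1, the
+// Montgomery quotient word is simply d0, so, ignoring the register rotation
+// used above, the step on a standalone 4-word value computes
+// ([d3;d2;d1;d0] + d0 * p_sm2) >> 64:
+//
+//   #include <stdint.h>
+//
+//   static const uint64_t P_SM2[4] = {
+//       0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFF00000000ull,
+//       0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFEFFFFFFFFull};
+//
+//   static void montreds_step(uint64_t d[4]) {
+//     uint64_t q = d[0], out[4];      // -p_sm2^-1 = 1 (mod 2^64), so q = d0
+//     unsigned __int128 acc =
+//         (unsigned __int128)d[0] + (unsigned __int128)q * P_SM2[0];
+//     acc >>= 64;                     // low word is exactly zero by design
+//     for (int i = 1; i < 4; i++) {
+//       acc += (unsigned __int128)d[i] + (unsigned __int128)q * P_SM2[i];
+//       out[i - 1] = (uint64_t)acc;
+//       acc >>= 64;
+//     }
+//     out[3] = (uint64_t)acc;         // result < p_sm2 + 2^192, four words
+//     for (int i = 0; i < 4; i++) d[i] = out[i];
+//   }
+//
+// The macro below obtains the same additions of d0 * p_sm2 purely with
+// 32-bit shift and subtract steps, avoiding any multiplication.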
+// --------------------------------------------------------------------------- + +#define montreds(d3,d2,d1,d0) \ + movq d0, %rax ; \ + shlq $32, %rax ; \ + movq d0, %rcx ; \ + shrq $32, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq d0, %rax ; \ + sbbq $0, %rcx ; \ + subq %rax, d1 ; \ + sbbq %rcx, d2 ; \ + sbbq %rdx, d3 ; \ + sbbq %rbx, d0 + +S2N_BN_SYMBOL(bignum_montsqr_sm2_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Compute [%r15;%r8] = [00] which we use later, but mainly +// set up an initial window [%r14;...;%r9] = [23;03;01] + + movq (x), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 8(x), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 24(x), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 16(x), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + +// Chain in the addition of 02 + 12 + 13 to that window (no carry-out possible) +// This gives all the "heterogeneous" terms of the squaring ready to double + + mulpadi(%rcx,%r11,%r10,(x)) + mulpadd(%rcx,%r12,%r11,8(x)) + movq 24(x), %rbx + mulpade(%rcx,%r13,%r12,8(x)) + adcq $0, %r14 + +// Double the window [%r14;...;%r9], catching top carry in %rcx + + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + +// Add to the 00 + 11 + 22 + 33 terms + + movq 8(x), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 16(x), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 24(x), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + +// Squaring complete. Perform 4 Montgomery steps to rotate the lower half + + montreds(%r11,%r10,%r9,%r8) + montreds(%r8,%r11,%r10,%r9) + montreds(%r9,%r8,%r11,%r10) + montreds(%r10,%r9,%r8,%r11) + +// Add high and low parts, catching carry in %rax + + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + +// Load [%r8;%r11;%rbx;%rdx;%rcx] = 2^320 - p_sm2 then do +// [%r8;%r11;%rbx;%rdx;%rcx] = [%rax;%r15;%r14;%r13;%r12] + (2^320 - p_sm2) + + movl $1, %ecx + movl $0x00000000FFFFFFFF, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 1(%rdx), %r11 + adcq %r13, %rdx + leaq -1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + +// Now carry is set if r + (2^320 - p_sm2) >= 2^320, i.e. r >= p_sm2 +// where r is the pre-reduced form. So conditionally select the +// output accordingly. + + cmovcq %rcx, %r12 + cmovcq %rdx, %r13 + cmovcq %rbx, %r14 + cmovcq %r11, %r15 + +// Write back reduced value + + movq %r12, (z) + movq %r13, 8(z) + movq %r14, 16(z) + movq %r15, 24(z) + +// Restore saved registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_neg_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_neg_sm2.S new file mode 100644 index 00000000000..05a90e432b0 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_neg_sm2.S @@ -0,0 +1,90 @@ +// Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Negate modulo p_sm2, z := (-x) mod p_sm2, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_neg_sm2 (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_neg_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_neg_sm2) + .text + +#define z %rdi +#define x %rsi + +#define q %rdx + +#define d0 %rax +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n1 %r10 +#define n3 %r11 + +S2N_BN_SYMBOL(bignum_neg_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load the input digits as [d3;d2;d1;d0] and also set a bitmask q +// for the input being nonzero, so that we avoid doing -0 = p_sm2 +// and hence maintain strict modular reduction + + movq (x), d0 + movq 8(x), d1 + movq d0, n1 + orq d1, n1 + movq 16(x), d2 + movq 24(x), d3 + movq d2, n3 + orq d3, n3 + orq n1, n3 + negq n3 + sbbq q, q + +// Load the non-trivial words of p_sm2 = [n3;-1;n1;-1] and mask them with q + + movq $0xffffffff00000000, n1 + movq $0xfffffffeffffffff, n3 + andq q, n1 + andq q, n3 + +// Do the subtraction, using an xor for the first digit and getting the +// overall result as [n3;q;n1;d0], all these tweaks just to avoid moves + + xorq q, d0 + subq d1, n1 + sbbq d2, q + sbbq d3, n3 + +// Write back + + movq d0, (z) + movq n1, 8(z) + movq q, 16(z) + movq n3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_optneg_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_optneg_sm2.S new file mode 100644 index 00000000000..f58342adc20 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_optneg_sm2.S @@ -0,0 +1,100 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally negate modulo p_sm2, z := (-x) mod p_sm2 (if p nonzero) or +// z := x (if p zero), assuming x reduced +// Inputs p, x[4]; output z[4] +// +// extern void bignum_optneg_sm2 +// (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = p, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = p, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optneg_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optneg_sm2) + .text + +#define z %rdi +#define q %rsi +#define x %rdx + +#define n0 %rax +#define n1 %rcx +#define n2 %r8 +#define n3 %r9 + +S2N_BN_SYMBOL(bignum_optneg_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Adjust q by zeroing it if the input is zero (to avoid giving -0 = p_sm2, +// which is not strictly reduced even though it's correct modulo p_sm2). 
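+//
+// As a portable illustration of this adjustment (a sketch only; the helper
+// name optneg_adjust is illustrative and not part of this file), the flag
+// can be zeroed in constant time by collapsing the input words into a
+// nonzero-bitmask, mirroring the or/neg/sbb/and sequence below:
+//
+//   #include <stdint.h>
+//
+//   // Return q unchanged if x is nonzero, and 0 if x is all zero, branch-free.
+//   static uint64_t optneg_adjust(uint64_t q, const uint64_t x[4]) {
+//     uint64_t t = x[0] | x[1] | x[2] | x[3];
+//     uint64_t mask = (uint64_t)0 - ((t | (0 - t)) >> 63);  // all-ones iff t != 0
+//     return q & mask;
+//   }
+//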
+// This step is redundant if we know a priori that the input is nonzero, which +// is the case for the y coordinate of points on the SM2 curve, for example. + + movq (x), n0 + orq 8(x), n0 + movq 16(x), n1 + orq 24(x), n1 + orq n1, n0 + negq n0 + sbbq n0, n0 + andq n0, q + +// Turn q into a bitmask, all 1s for q=false, all 0s for q=true + + negq q + sbbq q, q + notq q + +// Let [n3;n2;n1;n0] = if q then p_sm2 else -1 + + movq $0xffffffffffffffff, n0 + movq $0xffffffff00000000, n1 + orq q, n1 + movq n0, n2 + movq $0xfffffffeffffffff, n3 + orq q, n3 + +// Subtract so [n3;n2;n1;n0] = if q then p_sm2 - x else -1 - x + + subq (x), n0 + sbbq 8(x), n1 + sbbq 16(x), n2 + sbbq 24(x), n3 + +// XOR the words with the bitmask, which in the case q = false has the +// effect of restoring ~(-1 - x) = -(-1 - x) - 1 = 1 + x - 1 = x +// and write back the digits to the output + + xorq q, n0 + movq n0, (z) + xorq q, n1 + movq n1, 8(z) + xorq q, n2 + movq n2, 16(z) + xorq q, n3 + movq n3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_sub_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_sub_sm2.S new file mode 100644 index 00000000000..d6898b4a070 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_sub_sm2.S @@ -0,0 +1,89 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Subtract modulo p_sm2, z := (x - y) mod p_sm2 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_sub_sm2 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub_sm2) + .text + +#define z %rdi +#define x %rsi +#define y %rdx + +#define d0 %rax +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n1 %r10 +#define n3 %rdx +#define c %r11 + +#define n1short %r10d + + + +S2N_BN_SYMBOL(bignum_sub_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Load and subtract the two inputs as [d3;d2;d1;d0] = x - y (modulo 2^256) + + movq (x), d0 + subq (y), d0 + movq 8(x), d1 + sbbq 8(y), d1 + movq 16(x), d2 + sbbq 16(y), d2 + movq 24(x), d3 + sbbq 24(y), d3 + +// Capture the carry, which indicates x < y, and create corresponding masked +// correction p_sm2' = [n3; c; n1; c] to add + + movq $0xffffffff00000000, n1 + sbbq c, c + andq c, n1 + movq c, n3 + btr $32, n3 + +// Do the corrective addition and copy to output + + addq c, d0 + movq d0, (z) + adcq n1, d1 + movq d1, 8(z) + adcq c, d2 + movq d2, 16(z) + adcq n3, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_tomont_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_tomont_sm2.S new file mode 100644 index 00000000000..514c8b93c09 --- /dev/null +++ 
b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_tomont_sm2.S @@ -0,0 +1,144 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert to Montgomery form z := (2^256 * x) mod p_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_tomont_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tomont_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tomont_sm2) + .text + +#define z %rdi +#define x %rsi + +#define m0 %r8 +#define m1 %r9 +#define m2 %r10 +#define m3 %r11 + +#define q %rax +#define n1 %rcx +#define n3 %rdx +#define qshort %eax + +// ---------------------------------------------------------------------------- +// Core "x |-> (2^64 * x) mod p_sm2" macro, with x assumed to be < p_sm2. +// Input is [d3;d2;d1;d0] and output is [d2;d1;d0;q] where q is a fixed +// quotient estimate register (%rax), so the registers get shuffled. +// ---------------------------------------------------------------------------- + +#define modstep_sm2(d3,d2,d1,d0) \ +/* Writing the input, with lower zero digit appended, as */ \ +/* z = 2^256 * d3 + 2^192 * d2 + t, quotient approximation is */ \ +/* MIN ((d3 * (1 + 2^32 + 2^64) + d2 + 2^64) >> 64) (2^64 - 1) */ \ + movq d2, n1 ; \ + movl $1, qshort ; \ + addq d3, n1 ; \ + adcq d3, q ; \ + shrq $32, n1 ; \ + addq d3, n1 ; \ + shrq $32, n1 ; \ + addq n1, q ; \ + sbbq $0, q ; \ +/* Compute the pre-reduced [d3;d2;d1;d0;q] = m - p_sm2 * q */ \ +/* = z - (2^256 - 2^224 - 2^96 + 2^64 - 1) * q */ \ + movq q, n1 ; \ + movq q, n3 ; \ + shlq $32, n1 ; \ + shrq $32, n3 ; \ + addq n1, d2 ; \ + adcq n3, d3 ; \ + subq q, n1 ; \ + sbbq $0, n3 ; \ + subq q, d3 ; \ + addq n1, d0 ; \ + adcq n3, d1 ; \ + adcq $0, d2 ; \ + adcq $0, d3 ; \ +/* Corrective addition with top word d3 as a bitmask */ \ + movq $0xffffffff00000000, n1 ; \ + andq d3, n1 ; \ + movq $0xfffffffeffffffff, n3 ; \ + andq d3, n3 ; \ + addq d3, q ; \ + adcq n1, d0 ; \ + adcq d3, d1 ; \ + adcq n3, d2 + +S2N_BN_SYMBOL(bignum_tomont_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load the inputs + + movq (x), m0 + movq 8(x), m1 + movq 16(x), m2 + movq 24(x), m3 + +// Load non-trivial digits [n3; -1; n1; -1] = p_sm2 and do a conditional +// subtraction to reduce the four starting digits [m3;m2;m1;m0] modulo p_sm2 + + subq $-1, m0 + movq $0xffffffff00000000, n1 + sbbq n1, m1 + movq $0xfffffffeffffffff, n3 + sbbq $-1, m2 + sbbq n3, m3 + sbbq q, q + andq q, n1 + andq q, n3 + addq q, m0 + adcq n1, m1 + adcq q, m2 + adcq n3, m3 + +// Now do 4 iterations of 5->4 word modular reduction + + modstep_sm2(m3,m2,m1,m0) + + movq q, m3 + + modstep_sm2(m2,m1,m0,m3) + + movq q, m2 + + modstep_sm2(m1,m0,m3,m2) + + movq q, m1 + + modstep_sm2(m0,m3,m2,m1) + +// Write back result and return + + movq q, (z) + movq m1, 8(z) + movq m2, 16(z) + movq m3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_triple_sm2.S 
b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_triple_sm2.S new file mode 100644 index 00000000000..4e19aa85f19 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_triple_sm2.S @@ -0,0 +1,128 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Triple modulo p_sm2, z := (3 * x) mod p_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_triple_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// The input x can be any 4-digit bignum, not necessarily reduced modulo p_sm2, +// and the result is always fully reduced, i.e. z = (3 * x) mod p_sm2. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_sm2) + .text + +#define z %rdi +#define x %rsi + +// Main digits of intermediate results + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +// Quotient estimate = top of product + 1 + +#define q %rdx +#define h %rdx +#define qshort %edx + +// Other temporary variables and their short version + +#define a %rax +#define c %rcx + +#define ashort %eax + +S2N_BN_SYMBOL(bignum_triple_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// First do the multiplication by 3, getting z = [h; d3; ...; d0] +// but immediately form the quotient estimate q = h + 1 + + xorl ashort, ashort + + movq (x), q + movq q, d0 + adcxq q, q + adoxq q, d0 + movq 8(x), q + movq q, d1 + adcxq q, q + adoxq q, d1 + movq 16(x), q + movq q, d2 + adcxq q, q + adoxq q, d2 + movq 24(x), q + movq q, d3 + adcxq q, q + adoxq q, d3 + +// For this limited range a simple quotient estimate of q = h + 1 works, where +// h = floor(z / 2^256). Then -p_sm2 <= z - q * p_sm2 < p_sm2, so we just need +// to subtract q * p_sm2 and then if that's negative, add back p_sm2. + + movl $1, qshort + adcxq a, q + adoxq a, q + +// Now compute the initial pre-reduced [h;d3;d2;d1;d0] = z - p_sm2 * q +// = z - (2^256 - 2^224 - 2^96 + 2^64 - 1) * q + + movq q, a + shlq $32, a + movq a, c + subq q, a + + addq q, d0 + adcq a, d1 + adcq $0, d2 + adcq c, d3 + sbbq h, h + notq h + +// Now our top word h is either zero or all 1s, and we use this to discriminate +// whether a correction is needed because our result is negative, as a bitmask +// Do a masked addition of p_sm2 + + movq $0xffffffff00000000, a + andq h, a + movq $0xfffffffeffffffff, c + andq h, c + addq h, d0 + movq d0, (z) + adcq a, d1 + movq d1, 8(z) + adcq h, d2 + movq d2, 16(z) + adcq c, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_triple_sm2_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_triple_sm2_alt.S new file mode 100644 index 00000000000..a06d91f8097 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_triple_sm2_alt.S @@ -0,0 +1,131 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Triple modulo p_sm2, z := (3 * x) mod p_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_triple_sm2_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// The input x can be any 4-digit bignum, not necessarily reduced modulo p_sm2, +// and the result is always fully reduced, i.e. z = (3 * x) mod p_sm2. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_sm2_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_sm2_alt) + .text + +#define z %rdi +#define x %rsi + +// Main digits of intermediate results + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +// Quotient estimate = top of product + 1 + +#define q %rdx +#define h %rdx + +// Other temporary variables and their short version + +#define a %rax +#define c %rcx +#define d %rdx + +#define ashort %eax +#define cshort %ecx + +S2N_BN_SYMBOL(bignum_triple_sm2_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// First do the multiplication by 3, getting z = [h; d3; ...; d0] +// but immediately form the quotient estimate q = h + 1 + + movl $3, cshort + + movq (x), a + mulq c + movq a, d0 + movq d, d1 + + movq 8(x), a + xorq d2, d2 + mulq c + addq a, d1 + adcq d, d2 + + movq 16(x), a + xorq d3, d3 + mulq c + addq a, d2 + adcq d, d3 + + movq 24(x), a + mulq c + addq a, d3 + +// For this limited range a simple quotient estimate of q = h + 1 works, where +// h = floor(z / 2^256). Then -p_sm2 <= z - q * p_sm2 < p_sm2, so we just need +// to subtract q * p_sm2 and then if that's negative, add back p_sm2. + + adcq $1, q + +// Now compute the initial pre-reduced [h;d3;d2;d1;d0] = z - p_sm2 * q +// = z - (2^256 - 2^224 - 2^96 + 2^64 - 1) * q + + movq q, a + shlq $32, a + movq a, c + subq q, a + + addq q, d0 + adcq a, d1 + adcq $0, d2 + adcq c, d3 + sbbq h, h + notq h + +// Now our top word h is either zero or all 1s, and we use this to discriminate +// whether a correction is needed because our result is negative, as a bitmask +// Do a masked addition of p_sm2 + + movq $0xffffffff00000000, a + andq h, a + movq $0xfffffffeffffffff, c + andq h, c + addq h, d0 + movq d0, (z) + adcq a, d1 + movq d1, 8(z) + adcq h, d2 + movq d2, 16(z) + adcq c, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjadd.S new file mode 100644 index 00000000000..75313535e71 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjadd.S @@ -0,0 +1,621 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on GM/T 0003-2012 curve SM2 in Montgomery-Jacobian coordinates +// +// extern void sm2_montjadd +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_sm2. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjadd) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1 and %rbp = p2, +// which needs to be set up explicitly before use. +// By design, none of the code macros modify any of +// these, so we maintain the assignments throughout. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rbp) +#define y_2 NUMSIZE(%rbp) +#define z_2 (2*NUMSIZE)(%rbp) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define x1a (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define z2sq (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define y1a (NUMSIZE*6)(%rsp) + +#define NSPACE (NUMSIZE*7) + +// Corresponds to bignum_montmul_sm2 except for registers + +#define montmul_sm2(P0,P1,P2) \ + xorl %ecx, %ecx ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + xorl %ecx, %ecx ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rcx, %r13 ; \ + xorl %ecx, %ecx ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcxq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; 
\ + adcxq %rax, %r14 ; \ + adoxq %rcx, %r15 ; \ + adcxq %rcx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, %r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + xorl %eax, %eax ; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + adcq %rax, %rax ; \ + movl $0x1, %ecx ; \ + movl $0xffffffff, %edx ; \ + xorl %ebx, %ebx ; \ + addq %r12, %rcx ; \ + leaq 0x1(%rdx), %r11 ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rbx), %r8 ; \ + adcq %r14, %rbx ; \ + adcq %r15, %r11 ; \ + adcq %rax, %r8 ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rbx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds to bignum_montsqr_sm2 except for registers + +#define montsqr_sm2(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rcx, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, 
%r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + xorl %eax, %eax ; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + adcq %rax, %rax ; \ + movl $0x1, %ecx ; \ + movl $0xffffffff, %edx ; \ + xorl %ebx, %ebx ; \ + addq %r12, %rcx ; \ + leaq 0x1(%rdx), %r11 ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rbx), %r8 ; \ + adcq %r14, %rbx ; \ + adcq %r15, %r11 ; \ + adcq %rax, %r8 ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rbx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Almost-Montgomery variant which we use when an input to other muls +// with the other argument fully reduced (which is always safe). + +#define amontsqr_sm2(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rcx, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, %r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + sbbq %rax, %rax ; \ + movq $0xffffffff00000000, %rbx ; \ + movq %rax, %rcx ; \ + andq %rax, %rbx ; \ + btr $32, %rcx ; \ + subq %rax, %r12 ; \ + sbbq %rbx, %r13 ; \ + sbbq %rax, %r14 ; \ + sbbq %rcx, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_sub_sm2 + +#define sub_sm2(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, 
%rax ; \ + movq 0x8+P1, %rcx ; \ + sbbq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq $0xffffffff00000000, %r10 ; \ + sbbq %r11, %r11 ; \ + andq %r11, %r10 ; \ + movq %r11, %rdx ; \ + btr $0x20, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq %r11, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// Additional macros to help with final multiplexing + +#define load4(r0,r1,r2,r3,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 + +#define store4(P,r0,r1,r2,r3) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P + +#define czload4(r0,r1,r2,r3,P) \ + cmovzq P, r0 ; \ + cmovzq 8+P, r1 ; \ + cmovzq 16+P, r2 ; \ + cmovzq 24+P, r3 + +#define muxload4(r0,r1,r2,r3,P0,P1,P2) \ + movq P0, r0 ; \ + cmovbq P1, r0 ; \ + cmovnbe P2, r0 ; \ + movq 8+P0, r1 ; \ + cmovbq 8+P1, r1 ; \ + cmovnbe 8+P2, r1 ; \ + movq 16+P0, r2 ; \ + cmovbq 16+P1, r2 ; \ + cmovnbe 16+P2, r2 ; \ + movq 24+P0, r3 ; \ + cmovbq 24+P1, r3 ; \ + cmovnbe 24+P2, r3 + +S2N_BN_SYMBOL(sm2_montjadd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input y in %rbp where it lasts throughout the main code. + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdx, %rbp + +// Main code, just a sequence of basic field operations +// 12 * multiply + 4 * square + 7 * subtract + + amontsqr_sm2(z1sq,z_1) + amontsqr_sm2(z2sq,z_2) + + montmul_sm2(y1a,z_2,y_1) + montmul_sm2(y2a,z_1,y_2) + + montmul_sm2(x2a,z1sq,x_2) + montmul_sm2(x1a,z2sq,x_1) + montmul_sm2(y2a,z1sq,y2a) + montmul_sm2(y1a,z2sq,y1a) + + sub_sm2(xd,x2a,x1a) + sub_sm2(yd,y2a,y1a) + + amontsqr_sm2(zz,xd) + montsqr_sm2(ww,yd) + + montmul_sm2(zzx1,zz,x1a) + montmul_sm2(zzx2,zz,x2a) + + sub_sm2(resx,ww,zzx1) + sub_sm2(t1,zzx2,zzx1) + + montmul_sm2(xd,xd,z_1) + + sub_sm2(resx,resx,zzx2) + + sub_sm2(t2,zzx1,resx) + + montmul_sm2(t1,t1,y1a) + + montmul_sm2(resz,xd,z_2) + montmul_sm2(t2,yd,t2) + + sub_sm2(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "NBE" <=> ~(CF \/ ZF) <=> P1 = 0 /\ ~(P2 = 0) +// and "B" <=> CF <=> ~(P1 = 0) /\ P2 = 0 +// and "Z" <=> ZF <=> (P1 = 0 <=> P2 = 0) + + load4(%r8,%r9,%r10,%r11,z_1) + + movq %r8, %rax + movq %r9, %rdx + orq %r10, %rax + orq %r11, %rdx + orq %rdx, %rax + negq %rax + sbbq %rax, %rax + + load4(%r12,%r13,%r14,%r15,z_2) + + movq %r12, %rbx + movq %r13, %rdx + orq %r14, %rbx + orq %r15, %rdx + orq %rdx, %rbx + negq %rbx + sbbq %rbx, %rbx + + cmpq %rax, %rbx + +// Multiplex the outputs accordingly, re-using the z's in registers + + cmovbq %r8, %r12 + cmovbq %r9, %r13 + cmovbq %r10, %r14 + cmovbq %r11, %r15 + + czload4(%r12,%r13,%r14,%r15,resz) + + muxload4(%rax,%rbx,%rcx,%rdx,resx,x_1,x_2) + muxload4(%r8,%r9,%r10,%r11,resy,y_1,y_2) + +// Finally store back the multiplexed values + + store4(x_3,%rax,%rbx,%rcx,%rdx) + store4(y_3,%r8,%r9,%r10,%r11) + store4(z_3,%r12,%r13,%r14,%r15) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, 
"", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjadd_alt.S new file mode 100644 index 00000000000..6e91054a0ad --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjadd_alt.S @@ -0,0 +1,559 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on GM/T 0003-2012 curve SM2 in Montgomery-Jacobian coordinates +// +// extern void sm2_montjadd_alt +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_sm2. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjadd_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1 and %rbp = p2, +// which needs to be set up explicitly before use. +// By design, none of the code macros modify any of +// these, so we maintain the assignments throughout. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rbp) +#define y_2 NUMSIZE(%rbp) +#define z_2 (2*NUMSIZE)(%rbp) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define x1a (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define z2sq (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define y1a (NUMSIZE*6)(%rsp) + +#define NSPACE (NUMSIZE*7) +// Corresponds to bignum_montmul_sm2_alt except for registers + +#define montmul_sm2(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %r11, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; 
\ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, %r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + xorl %eax, %eax ; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + adcq %rax, %rax ; \ + movl $0x1, %ecx ; \ + movl $0xffffffff, %edx ; \ + xorl %ebx, %ebx ; \ + addq %r12, %rcx ; \ + leaq 0x1(%rdx), %r11 ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rbx), %r8 ; \ + adcq %r14, %rbx ; \ + adcq %r15, %r11 ; \ + adcq %rax, %r8 ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rbx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds to bignum_montsqr_sm2_alt except for registers + +#define montsqr_sm2(P0,P1) \ + movq P1, %rax ; \ + movq %rax, %rbx ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r15 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r9 ; \ + movq %rdx, %r10 ; \ + movq 0x18+P1, %rax ; \ + movq %rax, %r13 ; \ + mulq %rbx; \ + movq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + movq 0x10+P1, %rax ; \ + movq %rax, %rbx ; \ + mulq %r13; \ + movq %rax, %r13 ; \ + movq %rdx, %r14 ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + movq 0x18+P1, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorl %ecx, %ecx ; \ + addq %r9, %r9 ; \ + adcq %r10, %r10 ; \ + adcq %r11, %r11 ; \ + adcq %r12, %r12 ; \ + adcq %r13, %r13 ; \ + adcq %r14, %r14 ; \ + adcq %rcx, %rcx ; \ + movq 0x8+P1, %rax 
; \ + mulq %rax; \ + addq %r15, %r9 ; \ + adcq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + negq %r15; \ + adcq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + negq %r15; \ + adcq %rax, %r14 ; \ + adcq %rcx, %rdx ; \ + movq %rdx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, %r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + xorl %eax, %eax ; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + adcq %rax, %rax ; \ + movl $0x1, %ecx ; \ + movl $0xffffffff, %edx ; \ + xorl %ebx, %ebx ; \ + addq %r12, %rcx ; \ + leaq 0x1(%rdx), %r11 ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rbx), %r8 ; \ + adcq %r14, %rbx ; \ + adcq %r15, %r11 ; \ + adcq %rax, %r8 ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rbx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_sub_sm2 + +#define sub_sm2(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + sbbq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq $0xffffffff00000000, %r10 ; \ + sbbq %r11, %r11 ; \ + andq %r11, %r10 ; \ + movq %r11, %rdx ; \ + btr $0x20, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq %r11, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// Additional macros to help with final multiplexing + +#define load4(r0,r1,r2,r3,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 + +#define store4(P,r0,r1,r2,r3) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P + +#define czload4(r0,r1,r2,r3,P) \ + cmovzq P, r0 ; \ + cmovzq 8+P, r1 ; \ + cmovzq 16+P, r2 ; \ + cmovzq 24+P, r3 + +#define muxload4(r0,r1,r2,r3,P0,P1,P2) \ + movq P0, r0 ; \ + cmovbq P1, r0 ; \ + cmovnbe P2, r0 ; \ + movq 8+P0, r1 ; \ + cmovbq 8+P1, r1 ; \ + cmovnbe 8+P2, r1 ; \ + movq 16+P0, r2 ; \ + cmovbq 16+P1, r2 ; \ + cmovnbe 16+P2, r2 ; \ + movq 24+P0, r3 ; \ + cmovbq 24+P1, r3 ; \ + cmovnbe 24+P2, r3 + +S2N_BN_SYMBOL(sm2_montjadd_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input y in %rbp where it lasts throughout the main code. 
+ + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdx, %rbp + +// Main code, just a sequence of basic field operations +// 12 * multiply + 4 * square + 7 * subtract + + montsqr_sm2(z1sq,z_1) + montsqr_sm2(z2sq,z_2) + + montmul_sm2(y1a,z_2,y_1) + montmul_sm2(y2a,z_1,y_2) + + montmul_sm2(x2a,z1sq,x_2) + montmul_sm2(x1a,z2sq,x_1) + montmul_sm2(y2a,z1sq,y2a) + montmul_sm2(y1a,z2sq,y1a) + + sub_sm2(xd,x2a,x1a) + sub_sm2(yd,y2a,y1a) + + montsqr_sm2(zz,xd) + montsqr_sm2(ww,yd) + + montmul_sm2(zzx1,zz,x1a) + montmul_sm2(zzx2,zz,x2a) + + sub_sm2(resx,ww,zzx1) + sub_sm2(t1,zzx2,zzx1) + + montmul_sm2(xd,xd,z_1) + + sub_sm2(resx,resx,zzx2) + + sub_sm2(t2,zzx1,resx) + + montmul_sm2(t1,t1,y1a) + + montmul_sm2(resz,xd,z_2) + montmul_sm2(t2,yd,t2) + + sub_sm2(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "NBE" <=> ~(CF \/ ZF) <=> P1 = 0 /\ ~(P2 = 0) +// and "B" <=> CF <=> ~(P1 = 0) /\ P2 = 0 +// and "Z" <=> ZF <=> (P1 = 0 <=> P2 = 0) + + load4(%r8,%r9,%r10,%r11,z_1) + + movq %r8, %rax + movq %r9, %rdx + orq %r10, %rax + orq %r11, %rdx + orq %rdx, %rax + negq %rax + sbbq %rax, %rax + + load4(%r12,%r13,%r14,%r15,z_2) + + movq %r12, %rbx + movq %r13, %rdx + orq %r14, %rbx + orq %r15, %rdx + orq %rdx, %rbx + negq %rbx + sbbq %rbx, %rbx + + cmpq %rax, %rbx + +// Multiplex the outputs accordingly, re-using the z's in registers + + cmovbq %r8, %r12 + cmovbq %r9, %r13 + cmovbq %r10, %r14 + cmovbq %r11, %r15 + + czload4(%r12,%r13,%r14,%r15,resz) + + muxload4(%rax,%rbx,%rcx,%rdx,resx,x_1,x_2) + muxload4(%r8,%r9,%r10,%r11,resy,y_1,y_2) + +// Finally store back the multiplexed values + + store4(x_3,%rax,%rbx,%rcx,%rdx) + store4(y_3,%r8,%r9,%r10,%r11) + store4(z_3,%r12,%r13,%r14,%r15) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjdouble.S new file mode 100644 index 00000000000..33e1cb1a46e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjdouble.S @@ -0,0 +1,648 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on GM/T 0003-2012 curve SM2 in Montgomery-Jacobian coordinates +// +// extern void sm2_montjdouble +// (uint64_t p3[static 12],uint64_t p1[static 12]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_sm2. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). 
+// +// Standard x86-64 ABI: RDI = p3, RSI = p1 +// Microsoft x64 ABI: RCX = p3, RDX = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjdouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjdouble) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1, which is true when the +// arguments come in initially and is not disturbed throughout. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z2 (NUMSIZE*0)(%rsp) +#define y4 (NUMSIZE*0)(%rsp) + +#define y2 (NUMSIZE*1)(%rsp) + +#define t1 (NUMSIZE*2)(%rsp) + +#define t2 (NUMSIZE*3)(%rsp) +#define x2p (NUMSIZE*3)(%rsp) +#define dx2 (NUMSIZE*3)(%rsp) + +#define xy2 (NUMSIZE*4)(%rsp) + +#define x4p (NUMSIZE*5)(%rsp) +#define d (NUMSIZE*5)(%rsp) + +#define NSPACE (NUMSIZE*6) + +// Corresponds to bignum_montmul_sm2 except for registers + +#define montmul_sm2(P0,P1,P2) \ + xorl %ecx, %ecx ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + xorl %ecx, %ecx ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rcx, %r13 ; \ + xorl %ecx, %ecx ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcxq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rcx, %r15 ; \ + adcxq %rcx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, %r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq 
%rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + xorl %eax, %eax ; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + adcq %rax, %rax ; \ + movl $0x1, %ecx ; \ + movl $0xffffffff, %edx ; \ + xorl %ebx, %ebx ; \ + addq %r12, %rcx ; \ + leaq 0x1(%rdx), %r11 ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rbx), %r8 ; \ + adcq %r14, %rbx ; \ + adcq %r15, %r11 ; \ + adcq %rax, %r8 ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rbx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds to bignum_montsqr_sm2 except for registers + +#define montsqr_sm2(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rcx, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, %r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + xorl %eax, %eax ; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + adcq %rax, %rax ; \ + movl $0x1, %ecx ; \ + movl $0xffffffff, %edx ; \ + xorl %ebx, %ebx ; \ + addq %r12, %rcx ; \ + leaq 0x1(%rdx), %r11 ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rbx), %r8 ; \ + adcq %r14, %rbx ; \ + adcq %r15, %r11 ; \ + adcq %rax, %r8 ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rbx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_sub_sm2 + +#define sub_sm2(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, 
%rcx ; \ + sbbq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq $0xffffffff00000000, %r10 ; \ + sbbq %r11, %r11 ; \ + andq %r11, %r10 ; \ + movq %r11, %rdx ; \ + btr $0x20, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq %r11, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// Corresponds exactly to bignum_add_sm2 + +#define add_sm2(P0,P1,P2) \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + adcq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + adcq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + adcq 0x18+P2, %r9 ; \ + adcq %r11, %r11 ; \ + subq $0xffffffffffffffff, %rax ; \ + movq $0xffffffff00000000, %r10 ; \ + sbbq %r10, %rcx ; \ + sbbq $0xffffffffffffffff, %r8 ; \ + movq $0xfffffffeffffffff, %rdx ; \ + sbbq %rdx, %r9 ; \ + sbbq $0x0, %r11 ; \ + andq %r11, %r10 ; \ + andq %r11, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq %r11, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// A weak version of add that only guarantees sum in 4 digits + +#define weakadd_sm2(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + adcq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + adcq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + adcq 0x18+P2, %r9 ; \ + movq $0xffffffff00000000, %r10 ; \ + sbbq %r11, %r11 ; \ + andq %r11, %r10 ; \ + movq %r11, %rdx ; \ + btr $0x20, %rdx ; \ + subq %r11, %rax ; \ + movq %rax, P0 ; \ + sbbq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + sbbq %r11, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// P0 = C * P1 - D * P2 computed as d * (p_sm2 - P2) + c * P1 +// Quotient estimation is done just as q = h + 1 as in bignum_triple_sm2 +// This also applies to the other functions following. 
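(Editorial aside, not part of the imported assembly.) The rearrangement described above keeps the intermediate value non-negative, and the q = h + 1 quotient estimate then needs only one masked correction at the end. A minimal Python sketch of the same arithmetic, assuming plain residues rather than Montgomery encoding and using the C = 12, D = 9 pair that appears below:

# Sketch only: checks that C*P1 - D*P2 mod p_sm2 equals
# C*P1 + D*(p_sm2 - P2) reduced with the q = h + 1 estimate,
# where h is the fifth 64-bit word of the 320-bit intermediate.
import random

P_SM2 = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF

def cmsub_sketch(c, p1, d, p2):
    acc = c * p1 + d * (P_SM2 - p2)    # non-negative rearrangement
    q = (acc >> 256) + 1               # quotient estimate, as in bignum_triple_sm2
    r = acc - q * P_SM2                # lies in [-p_sm2, p_sm2)
    return r + P_SM2 if r < 0 else r   # single masked correction, as in the tail code

for _ in range(1000):
    p1, p2 = random.randrange(P_SM2), random.randrange(P_SM2)
    assert cmsub_sketch(12, p1, 9, p2) == (12 * p1 - 9 * p2) % P_SM2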
+ +#define cmsub_sm2(P0,C,P1,D,P2) \ + /* First (%r11;%r10;%r9;%r8) = p_sm2 - P2 */ \ + movq $0xffffffffffffffff, %r8 ; \ + movq %r8, %r10 ; \ + subq P2, %r8 ; \ + movq $0xffffffff00000000, %r9 ; \ + sbbq 0x8+P2, %r9 ; \ + sbbq 0x10+P2, %r10 ; \ + movq $0xfffffffeffffffff, %r11 ; \ + sbbq 0x18+P2, %r11 ; \ + /* (%r12;%r11;%r10;%r9;%r8) = D * (p_sm2 - P2) */ \ + xorl %r12d, %r12d ; \ + movq $D, %rdx ; \ + mulxq %r8, %r8, %rax ; \ + mulxq %r9, %r9, %rcx ; \ + addq %rax, %r9 ; \ + mulxq %r10, %r10, %rax ; \ + adcq %rcx, %r10 ; \ + mulxq %r11, %r11, %rcx ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + /* (%rdx;%r11;%r10;%r9;%r8) = 2^256 + C * P1 + D * (p_sm2 - P2) */ \ + movq $C, %rdx ; \ + xorl %eax, %eax ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq 0x10+P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x18+P1, %rax, %rdx ; \ + adcxq %rax, %r11 ; \ + adoxq %r12, %rdx ; \ + adcq $1, %rdx ; \ + /* Now the tail for modular reduction from tripling */ \ + movq %rdx, %rax ; \ + shlq $0x20, %rax ; \ + movq %rax, %rcx ; \ + subq %rdx, %rax ; \ + addq %rdx, %r8 ; \ + adcq %rax, %r9 ; \ + adcq $0x0, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rdx, %rdx ; \ + notq %rdx; \ + movq $0xffffffff00000000, %rax ; \ + andq %rdx, %rax ; \ + movq %rdx, %rcx ; \ + btr $0x20, %rcx ; \ + addq %rdx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + adcq %rdx, %r10 ; \ + movq %r10, 0x10+P0 ; \ + adcq %rcx, %r11 ; \ + movq %r11, 0x18+P0 + +// P0 = 3 * P1 - 8 * P2, computed as (p_sm2 - P2) << 3 + 3 * P1 + +#define cmsub38_sm2(P0,P1,P2) \ + /* First (%r11;%r10;%r9;%r8) = p_sm2 - P2 */ \ + movq $0xffffffffffffffff, %r8 ; \ + movq %r8, %r10 ; \ + subq P2, %r8 ; \ + movq $0xffffffff00000000, %r9 ; \ + sbbq 0x8+P2, %r9 ; \ + sbbq 0x10+P2, %r10 ; \ + movq $0xfffffffeffffffff, %r11 ; \ + sbbq 0x18+P2, %r11 ; \ + /* (%r12;%r11;%r10;%r9;%r8) = (p_sm2 - P2) << 3 */ \ + movq %r11, %r12 ; \ + shldq $3, %r10, %r11 ; \ + shldq $3, %r9, %r10 ; \ + shldq $3, %r8, %r9 ; \ + shlq $3, %r8 ; \ + shrq $61, %r12 ; \ + /* (%rdx;%r11;%r10;%r9;%r8) = 2^256 + 3 * P1 + 8 * (p_sm2 - P2) */ \ + movq $3, %rdx ; \ + xorl %eax, %eax ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq 0x10+P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x18+P1, %rax, %rdx ; \ + adcxq %rax, %r11 ; \ + adoxq %r12, %rdx ; \ + adcq $1, %rdx ; \ + /* Now the tail for modular reduction from tripling */ \ + movq %rdx, %rax ; \ + shlq $0x20, %rax ; \ + movq %rax, %rcx ; \ + subq %rdx, %rax ; \ + addq %rdx, %r8 ; \ + adcq %rax, %r9 ; \ + adcq $0x0, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rdx, %rdx ; \ + notq %rdx; \ + movq $0xffffffff00000000, %rax ; \ + andq %rdx, %rax ; \ + movq %rdx, %rcx ; \ + btr $0x20, %rcx ; \ + addq %rdx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + adcq %rdx, %r10 ; \ + movq %r10, 0x10+P0 ; \ + adcq %rcx, %r11 ; \ + movq %r11, 0x18+P0 + +// P0 = 4 * P1 - P2, by direct subtraction of P2, +// since the quotient estimate still works safely +// for initial value > -p_sm2 + +#define cmsub41_sm2(P0,P1,P2) \ + movq 0x18+P1, %r11 ; \ + movq %r11, %rdx ; \ + movq 0x10+P1, %r10 ; \ + shldq $2, %r10, %r11 ; \ + movq 0x8+P1, %r9 ; \ + shldq $2, %r9, %r10 ; \ + movq P1, %r8 ; \ + shldq $2, %r8, %r9 ; \ + shlq $2, %r8 ; \ + shrq $62, %rdx ; \ + addq $1, 
%rdx ; \ + subq P2, %r8 ; \ + sbbq 0x8+P2, %r9 ; \ + sbbq 0x10+P2, %r10 ; \ + sbbq 0x18+P2, %r11 ; \ + sbbq $0, %rdx ; \ + /* Now the tail for modular reduction from tripling */ \ + movq %rdx, %rax ; \ + shlq $0x20, %rax ; \ + movq %rax, %rcx ; \ + subq %rdx, %rax ; \ + addq %rdx, %r8 ; \ + adcq %rax, %r9 ; \ + adcq $0x0, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rdx, %rdx ; \ + notq %rdx; \ + movq $0xffffffff00000000, %rax ; \ + andq %rdx, %rax ; \ + movq %rdx, %rcx ; \ + btr $0x20, %rcx ; \ + addq %rdx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + adcq %rdx, %r10 ; \ + movq %r10, 0x10+P0 ; \ + adcq %rcx, %r11 ; \ + movq %r11, 0x18+P0 + +S2N_BN_SYMBOL(sm2_montjdouble): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room on stack for temporary variables + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main code, just a sequence of basic field operations +// z2 = z^2 +// y2 = y^2 + + montsqr_sm2(z2,z_1) + montsqr_sm2(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + sub_sm2(t2,x_1,z2) + weakadd_sm2(t1,x_1,z2) + montmul_sm2(x2p,t1,t2) + +// t1 = y + z +// xy2 = x * y^2 +// x4p = x2p^2 + + add_sm2(t1,y_1,z_1) + montmul_sm2(xy2,x_1,y2) + montsqr_sm2(x4p,x2p) + +// t1 = (y + z)^2 + + montsqr_sm2(t1,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_sm2(d,12,xy2,9,x4p) + sub_sm2(t1,t1,z2) + +// y4 = y^4 + + montsqr_sm2(y4,y2) + +// dx2 = d * x2p + + montmul_sm2(dx2,d,x2p) + +// z_3' = 2 * y * z + + sub_sm2(z_3,t1,y2) + +// x' = 4 * xy2 - d + + cmsub41_sm2(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_sm2(y_3,dx2,y4) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjdouble_alt.S new file mode 100644 index 00000000000..d7d33851c73 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjdouble_alt.S @@ -0,0 +1,727 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on GM/T 0003-2012 curve SM2 in Montgomery-Jacobian coordinates +// +// extern void sm2_montjdouble_alt +// (uint64_t p3[static 12],uint64_t p1[static 12]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_sm2. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard x86-64 ABI: RDI = p3, RSI = p1 +// Microsoft x64 ABI: RCX = p3, RDX = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjdouble_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1, which is true when the +// arguments come in initially and is not disturbed throughout. 
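(Editorial aside, not part of the imported assembly.) This _alt file runs the same sequence of field operations as sm2_montjdouble above, only with mulq-based rather than mulx-based macros. The per-operation comments in the main code (z2 = z^2 through y' = 3 * dx2 - 8 * y4) match the standard Jacobian doubling formulas for curves with a = -3 (which SM2 uses): M = 3*(x^2 - z^4), S = 4*x*y^2, x' = M^2 - 2*S, y' = M*(S - x') - 8*y^4, z' = 2*y*z. A rough Python check of that correspondence, assuming plain residues mod p_sm2 rather than Montgomery encoding:

import random

P_SM2 = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF

def double_seq(x, y, z):
    # Mirrors the order of macro calls in the main code.
    z2 = z * z % P_SM2                     # montsqr_sm2(z2, z_1)
    y2 = y * y % P_SM2                     # montsqr_sm2(y2, y_1)
    x2p = (x - z2) * (x + z2) % P_SM2      # sub/weakadd + montmul: x^2 - z^4
    t1 = (y + z) % P_SM2                   # add_sm2(t1, y_1, z_1)
    xy2 = x * y2 % P_SM2                   # montmul_sm2(xy2, x_1, y2)
    x4p = x2p * x2p % P_SM2                # montsqr_sm2(x4p, x2p)
    t1 = t1 * t1 % P_SM2                   # montsqr_sm2(t1, t1) = (y + z)^2
    d = (12 * xy2 - 9 * x4p) % P_SM2       # cmsub_sm2(d, 12, xy2, 9, x4p)
    t1 = (t1 - z2) % P_SM2                 # sub_sm2(t1, t1, z2) = y^2 + 2yz
    y4 = y2 * y2 % P_SM2                   # montsqr_sm2(y4, y2)
    dx2 = d * x2p % P_SM2                  # montmul_sm2(dx2, d, x2p)
    z3 = (t1 - y2) % P_SM2                 # sub_sm2(z_3, t1, y2) = 2yz
    x3 = (4 * xy2 - d) % P_SM2             # cmsub41_sm2(x_3, xy2, d)
    y3 = (3 * dx2 - 8 * y4) % P_SM2        # cmsub38_sm2(y_3, dx2, y4)
    return x3, y3, z3

def double_ref(x, y, z):
    # Textbook Jacobian doubling for a = -3 (a pure polynomial identity,
    # so curve membership is not needed for the comparison).
    m = 3 * (x * x - pow(z, 4, P_SM2)) % P_SM2
    s = 4 * x * y * y % P_SM2
    x3 = (m * m - 2 * s) % P_SM2
    y3 = (m * (s - x3) - 8 * pow(y, 4, P_SM2)) % P_SM2
    return x3, y3, 2 * y * z % P_SM2

for _ in range(100):
    x, y, z = (random.randrange(P_SM2) for _ in range(3))
    assert double_seq(x, y, z) == double_ref(x, y, z)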
+ +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z2 (NUMSIZE*0)(%rsp) +#define y4 (NUMSIZE*0)(%rsp) + +#define y2 (NUMSIZE*1)(%rsp) + +#define t1 (NUMSIZE*2)(%rsp) + +#define t2 (NUMSIZE*3)(%rsp) +#define x2p (NUMSIZE*3)(%rsp) +#define dx2 (NUMSIZE*3)(%rsp) + +#define xy2 (NUMSIZE*4)(%rsp) + +#define x4p (NUMSIZE*5)(%rsp) +#define d (NUMSIZE*5)(%rsp) + +#define NSPACE (NUMSIZE*6) + +// Corresponds to bignum_montmul_sm2_alt except for registers + +#define montmul_sm2(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %r11, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, %r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + xorl %eax, %eax 
; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + adcq %rax, %rax ; \ + movl $0x1, %ecx ; \ + movl $0xffffffff, %edx ; \ + xorl %ebx, %ebx ; \ + addq %r12, %rcx ; \ + leaq 0x1(%rdx), %r11 ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rbx), %r8 ; \ + adcq %r14, %rbx ; \ + adcq %r15, %r11 ; \ + adcq %rax, %r8 ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rbx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds to bignum_montsqr_sm2_alt except for registers + +#define montsqr_sm2(P0,P1) \ + movq P1, %rax ; \ + movq %rax, %rbx ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r15 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r9 ; \ + movq %rdx, %r10 ; \ + movq 0x18+P1, %rax ; \ + movq %rax, %r13 ; \ + mulq %rbx; \ + movq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + movq 0x10+P1, %rax ; \ + movq %rax, %rbx ; \ + mulq %r13; \ + movq %rax, %r13 ; \ + movq %rdx, %r14 ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + movq 0x18+P1, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorl %ecx, %ecx ; \ + addq %r9, %r9 ; \ + adcq %r10, %r10 ; \ + adcq %r11, %r11 ; \ + adcq %r12, %r12 ; \ + adcq %r13, %r13 ; \ + adcq %r14, %r14 ; \ + adcq %rcx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %r15, %r9 ; \ + adcq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + negq %r15; \ + adcq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + negq %r15; \ + adcq %rax, %r14 ; \ + adcq %rcx, %rdx ; \ + movq %rdx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, %r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + xorl %eax, %eax ; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + adcq %rax, %rax ; \ + movl $0x1, %ecx ; \ + movl $0xffffffff, %edx ; \ + xorl %ebx, %ebx ; \ + addq %r12, %rcx ; \ + leaq 0x1(%rdx), %r11 ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rbx), %r8 ; \ + adcq %r14, %rbx ; \ + adcq %r15, %r11 ; \ + adcq %rax, %r8 ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rbx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// 
Corresponds exactly to bignum_sub_sm2 + +#define sub_sm2(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + sbbq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq $0xffffffff00000000, %r10 ; \ + sbbq %r11, %r11 ; \ + andq %r11, %r10 ; \ + movq %r11, %rdx ; \ + btr $0x20, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq %r11, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// Corresponds exactly to bignum_add_sm2 + +#define add_sm2(P0,P1,P2) \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + adcq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + adcq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + adcq 0x18+P2, %r9 ; \ + adcq %r11, %r11 ; \ + subq $0xffffffffffffffff, %rax ; \ + movq $0xffffffff00000000, %r10 ; \ + sbbq %r10, %rcx ; \ + sbbq $0xffffffffffffffff, %r8 ; \ + movq $0xfffffffeffffffff, %rdx ; \ + sbbq %rdx, %r9 ; \ + sbbq $0x0, %r11 ; \ + andq %r11, %r10 ; \ + andq %r11, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq %r11, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// A weak version of add that only guarantees sum in 4 digits + +#define weakadd_sm2(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + adcq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + adcq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + adcq 0x18+P2, %r9 ; \ + movq $0xffffffff00000000, %r10 ; \ + sbbq %r11, %r11 ; \ + andq %r11, %r10 ; \ + movq %r11, %rdx ; \ + btr $0x20, %rdx ; \ + subq %r11, %rax ; \ + movq %rax, P0 ; \ + sbbq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + sbbq %r11, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// P0 = C * P1 - D * P2 computed as d * (p_sm2 - P2) + c * P1 +// Quotient estimation is done just as q = h + 1 as in bignum_triple_sm2 +// This also applies to the other functions following. 
+ +#define cmsub_sm2(P0,C,P1,D,P2) \ + /* First (%r12;%r11;%r10;%r9) = p_sm2 - P2 */ \ + movq $0xffffffffffffffff, %r9 ; \ + movq %r9, %r11 ; \ + subq P2, %r9 ; \ + movq $0xffffffff00000000, %r10 ; \ + sbbq 0x8+P2, %r10 ; \ + sbbq 0x10+P2, %r11 ; \ + movq $0xfffffffeffffffff, %r12 ; \ + sbbq 0x18+P2, %r12 ; \ + /* (%r12;%r11;%r10;%r9;%r8) = D * (p_sm2 - P2) */ \ + movq $D, %rcx ; \ + movq %r9, %rax ; \ + mulq %rcx; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + movq %r10, %rax ; \ + xorl %r10d, %r10d ; \ + mulq %rcx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq %r11, %rax ; \ + xorl %r11d, %r11d ; \ + mulq %rcx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + movq %r12, %rax ; \ + xorl %r12d, %r12d ; \ + mulq %rcx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + /* (%rdx;%r11;%r10;%r9;%r8) = 2^256 + C * P1 + D * (p_sm2 - P2) */ \ + movl $C, %ecx ; \ + movq P1, %rax ; \ + mulq %rcx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rbx, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rbx, %rbx ; \ + movq 0x10+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbx, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + leaq 1(%r12), %rdx ; \ + /* Now the tail for modular reduction from tripling */ \ + movq %rdx, %rax ; \ + shlq $0x20, %rax ; \ + movq %rax, %rcx ; \ + subq %rdx, %rax ; \ + addq %rdx, %r8 ; \ + adcq %rax, %r9 ; \ + adcq $0x0, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rdx, %rdx ; \ + notq %rdx; \ + movq $0xffffffff00000000, %rax ; \ + andq %rdx, %rax ; \ + movq %rdx, %rcx ; \ + btr $0x20, %rcx ; \ + addq %rdx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + adcq %rdx, %r10 ; \ + movq %r10, 0x10+P0 ; \ + adcq %rcx, %r11 ; \ + movq %r11, 0x18+P0 + +// P0 = 3 * P1 - 8 * P2, computed as (p_sm2 - P2) << 3 + 3 * P1 + +#define cmsub38_sm2(P0,P1,P2) \ + /* First (%r11;%r10;%r9;%r8) = p_sm2 - P2 */ \ + movq $0xffffffffffffffff, %r8 ; \ + movq %r8, %r10 ; \ + subq P2, %r8 ; \ + movq $0xffffffff00000000, %r9 ; \ + sbbq 0x8+P2, %r9 ; \ + sbbq 0x10+P2, %r10 ; \ + movq $0xfffffffeffffffff, %r11 ; \ + sbbq 0x18+P2, %r11 ; \ + /* (%r12;%r11;%r10;%r9;%r8) = (p_sm2 - P2) << 3 */ \ + movq %r11, %r12 ; \ + shldq $3, %r10, %r11 ; \ + shldq $3, %r9, %r10 ; \ + shldq $3, %r8, %r9 ; \ + shlq $3, %r8 ; \ + shrq $61, %r12 ; \ + /* (%rdx;%r11;%r10;%r9;%r8) = 2^256 + 3 * P1 + 8 * (p_sm2 - P2) */ \ + movl $3, %ecx ; \ + movq P1, %rax ; \ + mulq %rcx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rbx, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rbx, %rbx ; \ + movq 0x10+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbx, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + leaq 1(%r12), %rdx ; \ + /* Now the tail for modular reduction from tripling */ \ + movq %rdx, %rax ; \ + shlq $0x20, %rax ; \ + movq %rax, %rcx ; \ + subq %rdx, %rax ; \ + addq %rdx, %r8 ; \ + adcq %rax, %r9 ; \ + adcq $0x0, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rdx, %rdx ; \ + notq %rdx; \ + movq $0xffffffff00000000, %rax ; \ + andq %rdx, %rax ; \ + movq %rdx, %rcx ; \ + btr $0x20, %rcx ; \ + addq %rdx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + adcq %rdx, %r10 ; \ + movq %r10, 0x10+P0 ; \ + adcq 
%rcx, %r11 ; \ + movq %r11, 0x18+P0 + +// P0 = 4 * P1 - P2, by direct subtraction of P2, +// since the quotient estimate still works safely +// for initial value > -p_sm2 + +#define cmsub41_sm2(P0,P1,P2) \ + movq 0x18+P1, %r11 ; \ + movq %r11, %rdx ; \ + movq 0x10+P1, %r10 ; \ + shldq $2, %r10, %r11 ; \ + movq 0x8+P1, %r9 ; \ + shldq $2, %r9, %r10 ; \ + movq P1, %r8 ; \ + shldq $2, %r8, %r9 ; \ + shlq $2, %r8 ; \ + shrq $62, %rdx ; \ + addq $1, %rdx ; \ + subq P2, %r8 ; \ + sbbq 0x8+P2, %r9 ; \ + sbbq 0x10+P2, %r10 ; \ + sbbq 0x18+P2, %r11 ; \ + sbbq $0, %rdx ; \ + /* Now the tail for modular reduction from tripling */ \ + movq %rdx, %rax ; \ + shlq $0x20, %rax ; \ + movq %rax, %rcx ; \ + subq %rdx, %rax ; \ + addq %rdx, %r8 ; \ + adcq %rax, %r9 ; \ + adcq $0x0, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rdx, %rdx ; \ + notq %rdx; \ + movq $0xffffffff00000000, %rax ; \ + andq %rdx, %rax ; \ + movq %rdx, %rcx ; \ + btr $0x20, %rcx ; \ + addq %rdx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + adcq %rdx, %r10 ; \ + movq %r10, 0x10+P0 ; \ + adcq %rcx, %r11 ; \ + movq %r11, 0x18+P0 + +S2N_BN_SYMBOL(sm2_montjdouble_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room on stack for temporary variables + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + montsqr_sm2(z2,z_1) + montsqr_sm2(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + sub_sm2(t2,x_1,z2) + weakadd_sm2(t1,x_1,z2) + montmul_sm2(x2p,t1,t2) + +// t1 = y + z +// xy2 = x * y^2 +// x4p = x2p^2 + + add_sm2(t1,y_1,z_1) + montmul_sm2(xy2,x_1,y2) + montsqr_sm2(x4p,x2p) + +// t1 = (y + z)^2 + + montsqr_sm2(t1,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_sm2(d,12,xy2,9,x4p) + sub_sm2(t1,t1,z2) + +// y4 = y^4 + + montsqr_sm2(y4,y2) + +// dx2 = d * x2p + + montmul_sm2(dx2,d,x2p) + +// z_3' = 2 * y * z + + sub_sm2(z_3,t1,y2) + +// x' = 4 * xy2 - d + + cmsub41_sm2(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_sm2(y_3,dx2,y4) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjmixadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjmixadd.S new file mode 100644 index 00000000000..48de1f997d1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjmixadd.S @@ -0,0 +1,594 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on GM/T 0003-2012 curve SM2 in Montgomery-Jacobian coordinates +// +// extern void sm2_montjmixadd +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_sm2. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. 
+// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjmixadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjmixadd) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1 and %rbp = p2, +// which needs to be set up explicitly before use. +// By design, none of the code macros modify any of +// these, so we maintain the assignments throughout. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rbp) +#define y_2 NUMSIZE(%rbp) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define NSPACE (NUMSIZE*6) + +// Corresponds to bignum_montmul_sm2 except for registers + +#define montmul_sm2(P0,P1,P2) \ + xorl %ecx, %ecx ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + xorl %ecx, %ecx ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rcx, %r13 ; \ + xorl %ecx, %ecx ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcxq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rcx, %r15 ; \ + adcxq %rcx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + 
movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, %r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + xorl %eax, %eax ; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + adcq %rax, %rax ; \ + movl $0x1, %ecx ; \ + movl $0xffffffff, %edx ; \ + xorl %ebx, %ebx ; \ + addq %r12, %rcx ; \ + leaq 0x1(%rdx), %r11 ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rbx), %r8 ; \ + adcq %r14, %rbx ; \ + adcq %r15, %r11 ; \ + adcq %rax, %r8 ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rbx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds to bignum_montsqr_sm2 except for registers + +#define montsqr_sm2(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rcx, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, %r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + xorl %eax, %eax ; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + adcq %rax, %rax ; \ + movl $0x1, %ecx ; \ + movl $0xffffffff, %edx ; \ + xorl %ebx, %ebx ; \ + addq %r12, %rcx ; \ + leaq 0x1(%rdx), %r11 ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rbx), %r8 ; \ + adcq %r14, %rbx ; \ + adcq %r15, %r11 ; \ + adcq %rax, %r8 ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; 
\ + cmovbq %rbx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Almost-Montgomery variant which we use when an input to other muls +// with the other argument fully reduced (which is always safe). + +#define amontsqr_sm2(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rcx, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, %r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + sbbq %rax, %rax ; \ + movq $0xffffffff00000000, %rbx ; \ + movq %rax, %rcx ; \ + andq %rax, %rbx ; \ + btr $32, %rcx ; \ + subq %rax, %r12 ; \ + sbbq %rbx, %r13 ; \ + sbbq %rax, %r14 ; \ + sbbq %rcx, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_sub_sm2 + +#define sub_sm2(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + sbbq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq $0xffffffff00000000, %r10 ; \ + sbbq %r11, %r11 ; \ + andq %r11, %r10 ; \ + movq %r11, %rdx ; \ + btr $0x20, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq %r11, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// Additional macros to help with final multiplexing + +#define testzero4(P) \ + movq P, %rax ; \ + movq 8+P, %rdx ; \ + orq 16+P, %rax ; \ + orq 24+P, %rdx ; \ + orq %rdx, %rax + +#define mux4(r0,r1,r2,r3,PNE,PEQ) \ + movq PNE, r0 ; 
\ + movq PEQ, %rax ; \ + cmovzq %rax, r0 ; \ + movq 8+PNE, r1 ; \ + movq 8+PEQ, %rax ; \ + cmovzq %rax, r1 ; \ + movq 16+PNE, r2 ; \ + movq 16+PEQ, %rax ; \ + cmovzq %rax, r2 ; \ + movq 24+PNE, r3 ; \ + movq 24+PEQ, %rax ; \ + cmovzq %rax, r3 + +#define load4(r0,r1,r2,r3,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 + +#define store4(P,r0,r1,r2,r3) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P + +S2N_BN_SYMBOL(sm2_montjmixadd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input y in %rbp where it lasts throughout the main code. + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdx, %rbp + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + amontsqr_sm2(zp2,z_1) + + montmul_sm2(y2a,z_1,y_2) + montmul_sm2(x2a,zp2,x_2) + montmul_sm2(y2a,zp2,y2a) + + sub_sm2(xd,x2a,x_1) + + sub_sm2(yd,y2a,y_1) + + amontsqr_sm2(zz,xd) + montsqr_sm2(ww,yd) + + montmul_sm2(zzx1,zz,x_1) + montmul_sm2(zzx2,zz,x2a) + + sub_sm2(resx,ww,zzx1) + sub_sm2(t1,zzx2,zzx1) + + montmul_sm2(resz,xd,z_1) + + sub_sm2(resx,resx,zzx2) + + sub_sm2(t2,zzx1,resx) + + montmul_sm2(t1,t1,y_1) + montmul_sm2(t2,yd,t2) + + sub_sm2(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + testzero4(z_1) + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^256 - p_sm2), +// hence giving 0 + p2 = p2 for the final result. + + mux4(%r8,%r9,%r10,%r11,resx,x_2) + mux4(%r12,%r13,%r14,%r15,resy,y_2) + + store4(x_3,%r8,%r9,%r10,%r11) + store4(y_3,%r12,%r13,%r14,%r15) + + load4(%r8,%r9,%r10,%r11,resz) + movl $1, %eax + cmovzq %rax, %r8 + movl $0x00000000ffffffff, %eax + cmovzq %rax, %r9 + movl $0, %eax + cmovzq %rax, %r10 + movq $0x0000000100000000, %rax + cmovzq %rax, %r11 + + store4(z_3,%r8,%r9,%r10,%r11) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjmixadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjmixadd_alt.S new file mode 100644 index 00000000000..74e5c7d45c2 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjmixadd_alt.S @@ -0,0 +1,533 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on GM/T 0003-2012 curve SM2 in Montgomery-Jacobian coordinates +// +// extern void sm2_montjmixadd_alt +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_sm2. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). 
+// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjmixadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjmixadd_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1 and %rbp = p2, +// which needs to be set up explicitly before use. +// By design, none of the code macros modify any of +// these, so we maintain the assignments throughout. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rbp) +#define y_2 NUMSIZE(%rbp) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define NSPACE (NUMSIZE*6) + +// Corresponds to bignum_montmul_sm2_alt except for registers + +#define montmul_sm2(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %r11, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + 
movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, %r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + xorl %eax, %eax ; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + adcq %rax, %rax ; \ + movl $0x1, %ecx ; \ + movl $0xffffffff, %edx ; \ + xorl %ebx, %ebx ; \ + addq %r12, %rcx ; \ + leaq 0x1(%rdx), %r11 ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rbx), %r8 ; \ + adcq %r14, %rbx ; \ + adcq %r15, %r11 ; \ + adcq %rax, %r8 ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rbx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds to bignum_montsqr_sm2_alt except for registers + +#define montsqr_sm2(P0,P1) \ + movq P1, %rax ; \ + movq %rax, %rbx ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r15 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r9 ; \ + movq %rdx, %r10 ; \ + movq 0x18+P1, %rax ; \ + movq %rax, %r13 ; \ + mulq %rbx; \ + movq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + movq 0x10+P1, %rax ; \ + movq %rax, %rbx ; \ + mulq %r13; \ + movq %rax, %r13 ; \ + movq %rdx, %r14 ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + movq 0x18+P1, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorl %ecx, %ecx ; \ + addq %r9, %r9 ; \ + adcq %r10, %r10 ; \ + adcq %r11, %r11 ; \ + adcq %r12, %r12 ; \ + adcq %r13, %r13 ; \ + adcq %r14, %r14 ; \ + adcq %rcx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %r15, %r9 ; \ + adcq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + negq %r15; \ + adcq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + negq %r15; \ + adcq %rax, %r14 ; \ + adcq %rcx, %rdx ; \ + movq %rdx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx 
; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, %r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + xorl %eax, %eax ; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + adcq %rax, %rax ; \ + movl $0x1, %ecx ; \ + movl $0xffffffff, %edx ; \ + xorl %ebx, %ebx ; \ + addq %r12, %rcx ; \ + leaq 0x1(%rdx), %r11 ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rbx), %r8 ; \ + adcq %r14, %rbx ; \ + adcq %r15, %r11 ; \ + adcq %rax, %r8 ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rbx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_sub_sm2 + +#define sub_sm2(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + sbbq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq $0xffffffff00000000, %r10 ; \ + sbbq %r11, %r11 ; \ + andq %r11, %r10 ; \ + movq %r11, %rdx ; \ + btr $0x20, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq %r11, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// Additional macros to help with final multiplexing + +#define testzero4(P) \ + movq P, %rax ; \ + movq 8+P, %rdx ; \ + orq 16+P, %rax ; \ + orq 24+P, %rdx ; \ + orq %rdx, %rax + +#define mux4(r0,r1,r2,r3,PNE,PEQ) \ + movq PNE, r0 ; \ + movq PEQ, %rax ; \ + cmovzq %rax, r0 ; \ + movq 8+PNE, r1 ; \ + movq 8+PEQ, %rax ; \ + cmovzq %rax, r1 ; \ + movq 16+PNE, r2 ; \ + movq 16+PEQ, %rax ; \ + cmovzq %rax, r2 ; \ + movq 24+PNE, r3 ; \ + movq 24+PEQ, %rax ; \ + cmovzq %rax, r3 + +#define load4(r0,r1,r2,r3,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 + +#define store4(P,r0,r1,r2,r3) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P + +S2N_BN_SYMBOL(sm2_montjmixadd_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input y in %rbp where it lasts throughout the main code. + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdx, %rbp + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + montsqr_sm2(zp2,z_1) + + montmul_sm2(y2a,z_1,y_2) + montmul_sm2(x2a,zp2,x_2) + montmul_sm2(y2a,zp2,y2a) + + sub_sm2(xd,x2a,x_1) + + sub_sm2(yd,y2a,y_1) + + montsqr_sm2(zz,xd) + montsqr_sm2(ww,yd) + + montmul_sm2(zzx1,zz,x_1) + montmul_sm2(zzx2,zz,x2a) + + sub_sm2(resx,ww,zzx1) + sub_sm2(t1,zzx2,zzx1) + + montmul_sm2(resz,xd,z_1) + + sub_sm2(resx,resx,zzx2) + + sub_sm2(t2,zzx1,resx) + + montmul_sm2(t1,t1,y_1) + montmul_sm2(t2,yd,t2) + + sub_sm2(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + testzero4(z_1) + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. 
+// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^256 - p_sm2), +// hence giving 0 + p2 = p2 for the final result. + + mux4(%r8,%r9,%r10,%r11,resx,x_2) + mux4(%r12,%r13,%r14,%r15,resy,y_2) + + store4(x_3,%r8,%r9,%r10,%r11) + store4(y_3,%r12,%r13,%r14,%r15) + + load4(%r8,%r9,%r10,%r11,resz) + movl $1, %eax + cmovzq %rax, %r8 + movl $0x00000000ffffffff, %eax + cmovzq %rax, %r9 + movl $0, %eax + cmovzq %rax, %r10 + movq $0x0000000100000000, %rax + cmovzq %rax, %r11 + + store4(z_3,%r8,%r9,%r10,%r11) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjscalarmul.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjscalarmul.S new file mode 100644 index 00000000000..de2d11d8d86 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjscalarmul.S @@ -0,0 +1,3859 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery-Jacobian form scalar multiplication for GM/T 0003-2012 curve SM2 +// Input scalar[4], point[12]; output res[12] +// +// extern void sm2_montjscalarmul +// (uint64_t res[static 12], +// uint64_t scalar[static 4], +// uint64_t point[static 12]); +// +// This function is a variant of its affine point version sm2_scalarmul. +// Here, input and output points are assumed to be in Jacobian form with +// their coordinates in the Montgomery domain. Thus, if priming indicates +// Montgomery form, x' = (2^256 * x) mod p_sm2 etc., each point argument +// is a triple (x',y',z') representing the affine point (x/z^2,y/z^3) when +// z' is nonzero or the point at infinity (group identity) if z' = 0. +// +// Given scalar = n and point = P, assumed to be on the GM/T 0003-2012 elliptic +// curve SM2, returns a representation of n * P. If the result is the +// point at infinity (either because the input point was or because the +// scalar was a multiple of the group order n_sm2) then the output is guaranteed to +// represent the point at infinity, i.e. to have its z coordinate zero. +// +// Standard x86-64 ABI: RDI = res, RSI = scalar, RDX = point +// Microsoft x64 ABI: RCX = res, RDX = scalar, R8 = point +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjscalarmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjscalarmul) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Intermediate variables on the stack. Uppercase syntactic variants +// make x86_att version simpler to generate. + +#define SCALARB (0*NUMSIZE) +#define scalarb (0*NUMSIZE)(%rsp) +#define ACC (1*NUMSIZE) +#define acc (1*NUMSIZE)(%rsp) +#define TABENT (4*NUMSIZE) +#define tabent (4*NUMSIZE)(%rsp) + +#define TAB (7*NUMSIZE) +#define tab (7*NUMSIZE)(%rsp) + +#define res (31*NUMSIZE)(%rsp) + +#define NSPACE (32*NUMSIZE) + +// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator, +// which doesn't accept repetitions, assembler macros etc.
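(Editorial aside, not part of the imported assembly.) The selectblock(I) macro defined next, invoked once per table entry, gives a constant-time table lookup: every entry is read and cmovzq retains only the one whose index matches, so the memory access pattern is independent of the secret digit. A rough Python equivalent, with hypothetical names:

def ct_select(table, idx):
    # table holds entries 1*P .. 8*P as 12-word Jacobian triples; idx == 0
    # leaves the accumulator at zero, which encodes the point at infinity.
    acc = [0] * 12
    for i, entry in enumerate(table, start=1):
        keep = int(i == idx)                  # cmpq $I, %rdi sets ZF; cmovzq copies
        acc = [keep * e + (1 - keep) * a for e, a in zip(entry, acc)]
    return acc

For idx in 1..8 this returns the matching entry; for idx 0 it returns twelve zero words.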
+ +#define selectblock(I) \ + cmpq $I, %rdi ; \ + cmovzq TAB+96*(I-1)(%rsp), %rax ; \ + cmovzq TAB+96*(I-1)+8(%rsp), %rbx ; \ + cmovzq TAB+96*(I-1)+16(%rsp), %rcx ; \ + cmovzq TAB+96*(I-1)+24(%rsp), %rdx ; \ + cmovzq TAB+96*(I-1)+32(%rsp), %r8 ; \ + cmovzq TAB+96*(I-1)+40(%rsp), %r9 ; \ + cmovzq TAB+96*(I-1)+48(%rsp), %r10 ; \ + cmovzq TAB+96*(I-1)+56(%rsp), %r11 ; \ + cmovzq TAB+96*(I-1)+64(%rsp), %r12 ; \ + cmovzq TAB+96*(I-1)+72(%rsp), %r13 ; \ + cmovzq TAB+96*(I-1)+80(%rsp), %r14 ; \ + cmovzq TAB+96*(I-1)+88(%rsp), %r15 + +S2N_BN_SYMBOL(sm2_montjscalarmul): + _CET_ENDBR + +// The Windows version literally calls the standard ABI version. +// This simplifies the proofs since subroutine offsets are fixed. + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + callq sm2_montjscalarmul_standard + popq %rsi + popq %rdi + ret + +sm2_montjscalarmul_standard: +#endif + +// Real start of the standard ABI code. + + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + + subq $NSPACE, %rsp + +// Preserve the "res" and "point" input arguments. We load and process the +// scalar immediately so we don't bother preserving that input argument. +// Also, "point" is only needed early on and so its register gets re-used. + + movq %rdx, %rbx + movq %rdi, res + +// Load the digits of group order n_sm2 = [%r15;%r14;%r13;%r12] + + movq $0x53bbf40939d54123, %r12 + movq $0x7203df6b21c6052b, %r13 + movq $0xffffffffffffffff, %r14 + movq $0xfffffffeffffffff, %r15 + +// First, reduce the input scalar mod n_sm2, i.e. conditionally subtract n_sm2 + + movq (%rsi), %r8 + subq %r12, %r8 + movq 8(%rsi), %r9 + sbbq %r13, %r9 + movq 16(%rsi), %r10 + sbbq %r14, %r10 + movq 24(%rsi), %r11 + sbbq %r15, %r11 + + cmovcq (%rsi), %r8 + cmovcq 8(%rsi), %r9 + cmovcq 16(%rsi), %r10 + cmovcq 24(%rsi), %r11 + +// Now if the top bit of the reduced scalar is set, negate it mod n_sm2, +// i.e. do n |-> n_sm2 - n. Remember the sign in %rbp so we can +// correspondingly negate the point below. + + subq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + sbbq %r11, %r15 + + movq %r11, %rbp + shrq $63, %rbp + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + +// In either case then add the recoding constant 0x08888...888 to allow +// signed digits. + + movq $0x8888888888888888, %rax + addq %rax, %r8 + adcq %rax, %r9 + adcq %rax, %r10 + adcq %rax, %r11 + btc $63, %r11 + + movq %r8, SCALARB(%rsp) + movq %r9, SCALARB+8(%rsp) + movq %r10, SCALARB+16(%rsp) + movq %r11, SCALARB+24(%rsp) + +// Set the tab[0] table entry to the input point = 1 * P, except +// that we negate it if the top bit of the scalar was set. This +// negation takes care over the y = 0 case to maintain all the +// coordinates < p_sm2 throughout, even though triples (x,y,z) +// with y = 0 can only represent a point on the curve when z = 0 +// and it represents the point at infinity regardless of x and y. 
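(Editorial aside, not part of the imported assembly.) With the recoding above, each 4-bit window of the adjusted scalar encodes a signed digit d = v - 8 in [-8, 7]. The main loop below recovers |d| and its sign without branching (subq $8 / sbbq / xorq / subq), selects |d| * P from the table, and negates the y coordinate when d is negative. A minimal sketch of that decode, assuming 64-bit two's-complement words:

MASK64 = (1 << 64) - 1

def decode_window(v):
    d = (v - 8) & MASK64                  # subq $8, %rdi (two's complement)
    sign = MASK64 if v < 8 else 0         # sbbq %rsi, %rsi: all-ones iff a borrow occurred
    absd = ((d ^ sign) - sign) & MASK64   # xorq + subq: branchless absolute value
    return absd, sign != 0                # |v - 8| and the "negate the point" flag

for v in range(16):
    absd, neg = decode_window(v)
    assert absd == abs(v - 8) and neg == (v - 8 < 0)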
+ + movq (%rbx), %rax + movq %rax, TAB(%rsp) + movq 8(%rbx), %rax + movq %rax, TAB+8(%rsp) + movq 16(%rbx), %rax + movq %rax, TAB+16(%rsp) + movq 24(%rbx), %rax + movq %rax, TAB+24(%rsp) + + movq 32(%rbx), %r12 + movq %r12, %rax + movq 40(%rbx), %r13 + orq %r13, %rax + movq 48(%rbx), %r14 + movq %r14, %rcx + movq 56(%rbx), %r15 + orq %r15, %rcx + orq %rcx, %rax + cmovzq %rax, %rbp + + xorl %r11d, %r11d + movl $0x00000000ffffffff, %r9d + notq %r11 + movq %r11, %r8 + movq %r11, %r10 + xorq %r8, %r9 + btr $32, %r11 + + subq %r12, %r8 + sbbq %r13, %r9 + sbbq %r14, %r10 + sbbq %r15, %r11 + testq %rbp, %rbp + cmovzq %r12, %r8 + cmovzq %r13, %r9 + cmovzq %r14, %r10 + cmovzq %r15, %r11 + movq %r8, TAB+32(%rsp) + movq %r9, TAB+40(%rsp) + movq %r10, TAB+48(%rsp) + movq %r11, TAB+56(%rsp) + + movq 64(%rbx), %rax + movq %rax, TAB+64(%rsp) + movq 72(%rbx), %rax + movq %rax, TAB+72(%rsp) + movq 80(%rbx), %rax + movq %rax, TAB+80(%rsp) + movq 88(%rbx), %rax + movq %rax, TAB+88(%rsp) + +// Compute and record tab[1] = 2 * p, ..., tab[7] = 8 * P + + leaq TAB+96*1(%rsp), %rdi + leaq TAB(%rsp), %rsi + callq sm2_montjscalarmul_sm2_montjdouble + + leaq TAB+96*2(%rsp), %rdi + leaq TAB+96*1(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq sm2_montjscalarmul_sm2_montjadd + + leaq TAB+96*3(%rsp), %rdi + leaq TAB+96*1(%rsp), %rsi + callq sm2_montjscalarmul_sm2_montjdouble + + leaq TAB+96*4(%rsp), %rdi + leaq TAB+96*3(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq sm2_montjscalarmul_sm2_montjadd + + leaq TAB+96*5(%rsp), %rdi + leaq TAB+96*2(%rsp), %rsi + callq sm2_montjscalarmul_sm2_montjdouble + + leaq TAB+96*6(%rsp), %rdi + leaq TAB+96*5(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq sm2_montjscalarmul_sm2_montjadd + + leaq TAB+96*7(%rsp), %rdi + leaq TAB+96*3(%rsp), %rsi + callq sm2_montjscalarmul_sm2_montjdouble + +// Set up accumulator as table entry for top 4 bits (constant-time indexing) + + movq SCALARB+24(%rsp), %rdi + shrq $60, %rdi + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d + + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + + movq %rax, ACC(%rsp) + movq %rbx, ACC+8(%rsp) + movq %rcx, ACC+16(%rsp) + movq %rdx, ACC+24(%rsp) + movq %r8, ACC+32(%rsp) + movq %r9, ACC+40(%rsp) + movq %r10, ACC+48(%rsp) + movq %r11, ACC+56(%rsp) + movq %r12, ACC+64(%rsp) + movq %r13, ACC+72(%rsp) + movq %r14, ACC+80(%rsp) + movq %r15, ACC+88(%rsp) + +// Main loop over size-4 bitfield + + movl $252, %ebp + +sm2_montjscalarmul_mainloop: + subq $4, %rbp + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq sm2_montjscalarmul_sm2_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq sm2_montjscalarmul_sm2_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq sm2_montjscalarmul_sm2_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq sm2_montjscalarmul_sm2_montjdouble + + movq %rbp, %rax + shrq $6, %rax + movq (%rsp,%rax,8), %rdi + movq %rbp, %rcx + shrq %cl, %rdi + andq $15, %rdi + + subq $8, %rdi + sbbq %rsi, %rsi // %rsi = sign of digit (-1 = negative) + xorq %rsi, %rdi + subq %rsi, %rdi // %rdi = absolute value of digit + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + 
xorl %r15d, %r15d + + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + +// Store it to "tabent" with the y coordinate optionally negated +// Again, do it carefully to give coordinates < p_sm2 even in +// the degenerate case y = 0 (when z = 0 for points on the curve). + + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) + + movq %r12, TABENT+64(%rsp) + movq %r13, TABENT+72(%rsp) + movq %r14, TABENT+80(%rsp) + movq %r15, TABENT+88(%rsp) + + xorl %r15d, %r15d + movq %r8, %rax + movl $0x00000000ffffffff, %r13d + orq %r9, %rax + notq %r15 + movq %r10, %rcx + movq %r15, %r12 + orq %r11, %rcx + movq %r15, %r14 + xorq %r12, %r13 + btr $32, %r15 + orq %rcx, %rax + cmovzq %rax, %rsi + + subq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + sbbq %r11, %r15 + + testq %rsi, %rsi + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + + movq %r8, TABENT+32(%rsp) + movq %r9, TABENT+40(%rsp) + movq %r10, TABENT+48(%rsp) + movq %r11, TABENT+56(%rsp) + + leaq TABENT(%rsp), %rdx + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq sm2_montjscalarmul_sm2_montjadd + + testq %rbp, %rbp + jne sm2_montjscalarmul_mainloop + +// That's the end of the main loop, and we just need to copy the +// result in "acc" to the output. + + movq res, %rdi + movq ACC(%rsp), %rax + movq %rax, (%rdi) + movq ACC+8(%rsp), %rax + movq %rax, 8(%rdi) + movq ACC+16(%rsp), %rax + movq %rax, 16(%rdi) + movq ACC+24(%rsp), %rax + movq %rax, 24(%rdi) + + movq ACC+32(%rsp), %rax + movq %rax, 32(%rdi) + movq ACC+40(%rsp), %rax + movq %rax, 40(%rdi) + movq ACC+48(%rsp), %rax + movq %rax, 48(%rdi) + movq ACC+56(%rsp), %rax + movq %rax, 56(%rdi) + + movq ACC+64(%rsp), %rax + movq %rax, 64(%rdi) + movq ACC+72(%rsp), %rax + movq %rax, 72(%rdi) + movq ACC+80(%rsp), %rax + movq %rax, 80(%rdi) + movq ACC+88(%rsp), %rax + movq %rax, 88(%rdi) + +// Restore stack and registers and return + + addq $NSPACE, %rsp + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 + ret + +// Local copies of subroutines, complete clones at the moment + +sm2_montjscalarmul_sm2_montjadd: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xe0, %rsp + movq %rdx, %rbp + movq 0x40(%rsi), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x48(%rsi), %r9, %r10 + mulxq 0x58(%rsi), %r11, %r12 + movq 0x50(%rsi), %rdx + mulxq 0x58(%rsi), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x58(%rsi), %rdx + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x48(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x50(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x58(%rsi), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq 
%rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + sbbq %rax, %rax + movabsq $0xffffffff00000000, %rbx + movq %rax, %rcx + andq %rax, %rbx + btr $0x20, %rcx + subq %rax, %r12 + sbbq %rbx, %r13 + sbbq %rax, %r14 + sbbq %rcx, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x40(%rbp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x48(%rbp), %r9, %r10 + mulxq 0x58(%rbp), %r11, %r12 + movq 0x50(%rbp), %rdx + mulxq 0x58(%rbp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x40(%rbp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rbp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x58(%rbp), %rdx + mulxq 0x48(%rbp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x48(%rbp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x50(%rbp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x58(%rbp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + sbbq %rax, %rax + movabsq $0xffffffff00000000, %rbx + movq %rax, %rcx + andq %rax, %rbx + btr $0x20, %rcx + subq %rax, %r12 + sbbq %rbx, %r13 + sbbq %rax, %r14 + sbbq %rcx, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + xorl %ecx, %ecx + movq 0x20(%rsi), %rdx + mulxq 0x40(%rbp), %r8, %r9 + mulxq 0x48(%rbp), %rax, %r10 + addq %rax, %r9 + mulxq 0x50(%rbp), %rax, %r11 + adcq %rax, %r10 + mulxq 0x58(%rbp), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0x28(%rsi), %rdx + mulxq 0x40(%rbp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x48(%rbp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x50(%rbp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x58(%rbp), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq 
%rcx, %r13 + xorl %ecx, %ecx + movq 0x30(%rsi), %rdx + mulxq 0x40(%rbp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rbp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x50(%rbp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x58(%rbp), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0x38(%rsi), %rdx + mulxq 0x40(%rbp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x48(%rbp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x50(%rbp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x58(%rbp), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0xc0(%rsp) + movq %r13, 0xc8(%rsp) + movq %r14, 0xd0(%rsp) + movq %r15, 0xd8(%rsp) + xorl %ecx, %ecx + movq 0x20(%rbp), %rdx + mulxq 0x40(%rsi), %r8, %r9 + mulxq 0x48(%rsi), %rax, %r10 + addq %rax, %r9 + mulxq 0x50(%rsi), %rax, %r11 + adcq %rax, %r10 + mulxq 0x58(%rsi), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0x28(%rbp), %rdx + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x58(%rsi), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0x30(%rbp), %rdx + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x58(%rsi), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0x38(%rbp), %rdx + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x58(%rsi), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + 
movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + xorl %ecx, %ecx + movq 0x0(%rbp), %rdx + mulxq (%rsp), %r8, %r9 + mulxq 0x8(%rsp), %rax, %r10 + addq %rax, %r9 + mulxq 0x10(%rsp), %rax, %r11 + adcq %rax, %r10 + mulxq 0x18(%rsp), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0x8(%rbp), %rdx + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsp), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0x10(%rbp), %rdx + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x18(%rsp), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0x18(%rbp), %rdx + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x18(%rsp), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 
0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + xorl %ecx, %ecx + movq (%rsi), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rax, %r10 + addq %rax, %r9 + mulxq 0xb0(%rsp), %rax, %r11 + adcq %rax, %r10 + mulxq 0xb8(%rsp), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0x8(%rsi), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0x10(%rsi), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb8(%rsp), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0x18(%rsi), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0xb8(%rsp), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + xorl %ecx, %ecx + movq 0x20(%rsp), %rdx + mulxq (%rsp), %r8, %r9 + mulxq 0x8(%rsp), %rax, %r10 + addq %rax, %r9 + mulxq 0x10(%rsp), %rax, %r11 + adcq %rax, %r10 + mulxq 0x18(%rsp), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0x28(%rsp), %rdx + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsp), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0x30(%rsp), %rdx + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x18(%rsp), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 
0x38(%rsp), %rdx + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x18(%rsp), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + xorl %ecx, %ecx + movq 0xc0(%rsp), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rax, %r10 + addq %rax, %r9 + mulxq 0xb0(%rsp), %rax, %r11 + adcq %rax, %r10 + mulxq 0xb8(%rsp), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0xc8(%rsp), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0xd0(%rsp), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb8(%rsp), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0xd8(%rsp), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0xb8(%rsp), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq 
$0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0xc0(%rsp) + movq %r13, 0xc8(%rsp) + movq %r14, 0xd0(%rsp) + movq %r15, 0xd8(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0xa0(%rsp) + adcq %r10, %rcx + movq %rcx, 0xa8(%rsp) + adcq %r11, %r8 + movq %r8, 0xb0(%rsp) + adcq %rdx, %r9 + movq %r9, 0xb8(%rsp) + movq 0x20(%rsp), %rax + subq 0xc0(%rsp), %rax + movq 0x28(%rsp), %rcx + sbbq 0xc8(%rsp), %rcx + movq 0x30(%rsp), %r8 + sbbq 0xd0(%rsp), %r8 + movq 0x38(%rsp), %r9 + sbbq 0xd8(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x20(%rsp) + adcq %r10, %rcx + movq %rcx, 0x28(%rsp) + adcq %r11, %r8 + movq %r8, 0x30(%rsp) + adcq %rdx, %r9 + movq %r9, 0x38(%rsp) + movq 0xa0(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0xa8(%rsp), %r9, %r10 + mulxq 0xb8(%rsp), %r11, %r12 + movq 0xb0(%rsp), %rdx + mulxq 0xb8(%rsp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0xb8(%rsp), %rdx + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0xa8(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0xb0(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0xb8(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + sbbq %rax, %rax + movabsq $0xffffffff00000000, %rbx + movq %rax, %rcx + andq %rax, %rbx + btr $0x20, %rcx + subq %rax, %r12 + sbbq %rbx, 
%r13 + sbbq %rax, %r14 + sbbq %rcx, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x20(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x28(%rsp), %r9, %r10 + mulxq 0x38(%rsp), %r11, %r12 + movq 0x30(%rsp), %rdx + mulxq 0x38(%rsp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x38(%rsp), %rdx + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x28(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x30(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x38(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + xorl %ecx, %ecx + movq 0x80(%rsp), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rax, %r10 + addq %rax, %r9 + mulxq 0x70(%rsp), %rax, %r11 + adcq %rax, %r10 + mulxq 0x78(%rsp), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0x88(%rsp), %rdx + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0x90(%rsp), %rdx + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x78(%rsp), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0x98(%rsp), %rdx + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x78(%rsp), %rax, %r15 + 
adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + xorl %ecx, %ecx + movq 0x40(%rsp), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rax, %r10 + addq %rax, %r9 + mulxq 0x70(%rsp), %rax, %r11 + adcq %rax, %r10 + mulxq 0x78(%rsp), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0x48(%rsp), %rdx + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0x50(%rsp), %rdx + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x78(%rsp), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0x58(%rsp), %rdx + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x78(%rsp), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq 
%r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq (%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq %r11, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq %r11, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + xorl %ecx, %ecx + movq 0x40(%rsi), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rax, %r10 + addq %rax, %r9 + mulxq 0xb0(%rsp), %rax, %r11 + adcq %rax, %r10 + mulxq 0xb8(%rsp), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0x48(%rsi), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0x50(%rsi), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb8(%rsp), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0x58(%rsi), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0xb8(%rsp), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, 
%ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq (%rsp), %rax + subq 0x40(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x48(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x50(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x58(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq %r11, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x80(%rsp), %rax + subq (%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq %r11, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + xorl %ecx, %ecx + movq 0xc0(%rsp), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rax, %r10 + addq %rax, %r9 + mulxq 0x70(%rsp), %rax, %r11 + adcq %rax, %r10 + mulxq 0x78(%rsp), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0xc8(%rsp), %rdx + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0xd0(%rsp), %rdx + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x78(%rsp), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0xd8(%rsp), %rdx + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x78(%rsp), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + 
leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + xorl %ecx, %ecx + movq 0x40(%rbp), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rax, %r10 + addq %rax, %r9 + mulxq 0xb0(%rsp), %rax, %r11 + adcq %rax, %r10 + mulxq 0xb8(%rsp), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0x48(%rbp), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0x50(%rbp), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb8(%rsp), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0x58(%rbp), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0xb8(%rsp), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + xorl %ecx, %ecx + movq 0x80(%rsp), %rdx + mulxq 0x20(%rsp), %r8, %r9 + mulxq 0x28(%rsp), %rax, %r10 + addq %rax, %r9 + mulxq 0x30(%rsp), %rax, %r11 + adcq %rax, %r10 + mulxq 0x38(%rsp), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0x88(%rsp), %rdx + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x38(%rsp), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0x90(%rsp), %rdx + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 
+ mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x38(%rsp), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0x98(%rsp), %rdx + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x38(%rsp), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x80(%rsp), %rax + subq 0x60(%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x68(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x70(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x78(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq %r11, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0x40(%rsi), %r8 + movq 0x48(%rsi), %r9 + movq 0x50(%rsi), %r10 + movq 0x58(%rsi), %r11 + movq %r8, %rax + movq %r9, %rdx + orq %r10, %rax + orq %r11, %rdx + orq %rdx, %rax + negq %rax + sbbq %rax, %rax + movq 0x40(%rbp), %r12 + movq 0x48(%rbp), %r13 + movq 0x50(%rbp), %r14 + movq 0x58(%rbp), %r15 + movq %r12, %rbx + movq %r13, %rdx + orq %r14, %rbx + orq %r15, %rdx + orq %rdx, %rbx + negq %rbx + sbbq %rbx, %rbx + cmpq %rax, %rbx + cmovbq %r8, %r12 + cmovbq %r9, %r13 + cmovbq %r10, %r14 + cmovbq %r11, %r15 + cmoveq 0xa0(%rsp), %r12 + cmoveq 0xa8(%rsp), %r13 + cmoveq 0xb0(%rsp), %r14 + cmoveq 0xb8(%rsp), %r15 + movq (%rsp), %rax + cmovbq (%rsi), %rax + cmova 0x0(%rbp), %rax + movq 0x8(%rsp), %rbx + cmovbq 0x8(%rsi), %rbx + cmova 0x8(%rbp), %rbx + movq 0x10(%rsp), %rcx + cmovbq 0x10(%rsi), %rcx + cmova 0x10(%rbp), %rcx + movq 0x18(%rsp), %rdx + cmovbq 0x18(%rsi), %rdx + cmova 0x18(%rbp), %rdx + movq 0x80(%rsp), %r8 + cmovbq 0x20(%rsi), %r8 + cmova 0x20(%rbp), %r8 + movq 0x88(%rsp), %r9 + cmovbq 0x28(%rsi), %r9 + cmova 0x28(%rbp), %r9 + movq 0x90(%rsp), %r10 + cmovbq 0x30(%rsi), %r10 + cmova 0x30(%rbp), %r10 + movq 0x98(%rsp), %r11 + cmovbq 0x38(%rsi), %r11 + cmova 
0x38(%rbp), %r11 + movq %rax, (%rdi) + movq %rbx, 0x8(%rdi) + movq %rcx, 0x10(%rdi) + movq %rdx, 0x18(%rdi) + movq %r8, 0x20(%rdi) + movq %r9, 0x28(%rdi) + movq %r10, 0x30(%rdi) + movq %r11, 0x38(%rdi) + movq %r12, 0x40(%rdi) + movq %r13, 0x48(%rdi) + movq %r14, 0x50(%rdi) + movq %r15, 0x58(%rdi) + addq $0xe0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +sm2_montjscalarmul_sm2_montjdouble: + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xc0, %rsp + movq 0x40(%rsi), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x48(%rsi), %r9, %r10 + mulxq 0x58(%rsi), %r11, %r12 + movq 0x50(%rsi), %rdx + mulxq 0x58(%rsi), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x58(%rsi), %rdx + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x48(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x50(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x58(%rsi), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x20(%rsi), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x28(%rsi), %r9, %r10 + mulxq 0x38(%rsi), %r11, %r12 + movq 0x30(%rsi), %rdx + mulxq 0x38(%rsi), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x38(%rsi), %rdx + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x28(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x30(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x38(%rsi), %rdx + mulxq 
%rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq (%rsi), %rax + subq (%rsp), %rax + movq 0x8(%rsi), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x10(%rsi), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x18(%rsi), %r9 + sbbq 0x18(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq %r11, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + movq (%rsi), %rax + addq (%rsp), %rax + movq 0x8(%rsi), %rcx + adcq 0x8(%rsp), %rcx + movq 0x10(%rsi), %r8 + adcq 0x10(%rsp), %r8 + movq 0x18(%rsi), %r9 + adcq 0x18(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + subq %r11, %rax + movq %rax, 0x40(%rsp) + sbbq %r10, %rcx + movq %rcx, 0x48(%rsp) + sbbq %r11, %r8 + movq %r8, 0x50(%rsp) + sbbq %rdx, %r9 + movq %r9, 0x58(%rsp) + xorl %ecx, %ecx + movq 0x60(%rsp), %rdx + mulxq 0x40(%rsp), %r8, %r9 + mulxq 0x48(%rsp), %rax, %r10 + addq %rax, %r9 + mulxq 0x50(%rsp), %rax, %r11 + adcq %rax, %r10 + mulxq 0x58(%rsp), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0x68(%rsp), %rdx + mulxq 0x40(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x48(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x50(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x58(%rsp), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0x70(%rsp), %rdx + mulxq 0x40(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x50(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x58(%rsp), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0x78(%rsp), %rdx + mulxq 0x40(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x48(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x50(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x58(%rsp), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 
+ adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + xorq %r11, %r11 + movq 0x20(%rsi), %rax + addq 0x40(%rsi), %rax + movq 0x28(%rsi), %rcx + adcq 0x48(%rsi), %rcx + movq 0x30(%rsi), %r8 + adcq 0x50(%rsi), %r8 + movq 0x38(%rsi), %r9 + adcq 0x58(%rsi), %r9 + adcq %r11, %r11 + subq $0xffffffffffffffff, %rax + movabsq $0xffffffff00000000, %r10 + sbbq %r10, %rcx + sbbq $0xffffffffffffffff, %r8 + movabsq $0xfffffffeffffffff, %rdx + sbbq %rdx, %r9 + sbbq $0x0, %r11 + andq %r11, %r10 + andq %r11, %rdx + addq %r11, %rax + movq %rax, 0x40(%rsp) + adcq %r10, %rcx + movq %rcx, 0x48(%rsp) + adcq %r11, %r8 + movq %r8, 0x50(%rsp) + adcq %rdx, %r9 + movq %r9, 0x58(%rsp) + xorl %ecx, %ecx + movq 0x20(%rsp), %rdx + mulxq (%rsi), %r8, %r9 + mulxq 0x8(%rsi), %rax, %r10 + addq %rax, %r9 + mulxq 0x10(%rsi), %rax, %r11 + adcq %rax, %r10 + mulxq 0x18(%rsi), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0x28(%rsp), %rdx + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0x30(%rsp), %rdx + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x18(%rsi), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0x38(%rsp), %rdx + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x18(%rsi), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + 
subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x60(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x68(%rsp), %r9, %r10 + mulxq 0x78(%rsp), %r11, %r12 + movq 0x70(%rsp), %rdx + mulxq 0x78(%rsp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x78(%rsp), %rdx + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x68(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x70(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x78(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq 0x40(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x48(%rsp), %r9, %r10 + mulxq 0x58(%rsp), %r11, %r12 + movq 0x50(%rsp), %rdx + mulxq 0x58(%rsp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x40(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x58(%rsp), %rdx 
+ mulxq 0x48(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x48(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x50(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x58(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq $0xffffffffffffffff, %r8 + movq %r8, %r10 + subq 0xa0(%rsp), %r8 + movabsq $0xffffffff00000000, %r9 + sbbq 0xa8(%rsp), %r9 + sbbq 0xb0(%rsp), %r10 + movabsq $0xfffffffeffffffff, %r11 + sbbq 0xb8(%rsp), %r11 + xorl %r12d, %r12d + movq $0x9, %rdx + mulxq %r8, %r8, %rax + mulxq %r9, %r9, %rcx + addq %rax, %r9 + mulxq %r10, %r10, %rax + adcq %rcx, %r10 + mulxq %r11, %r11, %rcx + adcq %rax, %r11 + adcq %rcx, %r12 + movq $0xc, %rdx + xorl %eax, %eax + mulxq 0x80(%rsp), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + mulxq 0x88(%rsp), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq 0x90(%rsp), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq 0x98(%rsp), %rax, %rdx + adcxq %rax, %r11 + adoxq %r12, %rdx + adcq $0x1, %rdx + movq %rdx, %rax + shlq $0x20, %rax + movq %rax, %rcx + subq %rdx, %rax + addq %rdx, %r8 + adcq %rax, %r9 + adcq $0x0, %r10 + adcq %rcx, %r11 + sbbq %rdx, %rdx + notq %rdx + movabsq $0xffffffff00000000, %rax + andq %rdx, %rax + movq %rdx, %rcx + btr $0x20, %rcx + addq %rdx, %r8 + movq %r8, 0xa0(%rsp) + adcq %rax, %r9 + movq %r9, 0xa8(%rsp) + adcq %rdx, %r10 + movq %r10, 0xb0(%rsp) + adcq %rcx, %r11 + movq %r11, 0xb8(%rsp) + movq 0x40(%rsp), %rax + subq (%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x40(%rsp) + adcq %r10, %rcx + movq %rcx, 0x48(%rsp) + adcq %r11, %r8 + movq %r8, 0x50(%rsp) + adcq %rdx, %r9 + movq %r9, 0x58(%rsp) + movq 0x20(%rsp), %rdx + mulxq %rdx, 
%r8, %r15 + mulxq 0x28(%rsp), %r9, %r10 + mulxq 0x38(%rsp), %r11, %r12 + movq 0x30(%rsp), %rdx + mulxq 0x38(%rsp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x38(%rsp), %rdx + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x28(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x30(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x38(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + xorl %ecx, %ecx + movq 0x60(%rsp), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rax, %r10 + addq %rax, %r9 + mulxq 0xb0(%rsp), %rax, %r11 + adcq %rax, %r10 + mulxq 0xb8(%rsp), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0x68(%rsp), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0x70(%rsp), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb8(%rsp), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0x78(%rsp), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0xb8(%rsp), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, 
%rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x40(%rsp), %rax + subq 0x20(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x28(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x30(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x38(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x40(%rdi) + adcq %r10, %rcx + movq %rcx, 0x48(%rdi) + adcq %r11, %r8 + movq %r8, 0x50(%rdi) + adcq %rdx, %r9 + movq %r9, 0x58(%rdi) + movq 0x98(%rsp), %r11 + movq %r11, %rdx + movq 0x90(%rsp), %r10 + shldq $0x2, %r10, %r11 + movq 0x88(%rsp), %r9 + shldq $0x2, %r9, %r10 + movq 0x80(%rsp), %r8 + shldq $0x2, %r8, %r9 + shlq $0x2, %r8 + shrq $0x3e, %rdx + addq $0x1, %rdx + subq 0xa0(%rsp), %r8 + sbbq 0xa8(%rsp), %r9 + sbbq 0xb0(%rsp), %r10 + sbbq 0xb8(%rsp), %r11 + sbbq $0x0, %rdx + movq %rdx, %rax + shlq $0x20, %rax + movq %rax, %rcx + subq %rdx, %rax + addq %rdx, %r8 + adcq %rax, %r9 + adcq $0x0, %r10 + adcq %rcx, %r11 + sbbq %rdx, %rdx + notq %rdx + movabsq $0xffffffff00000000, %rax + andq %rdx, %rax + movq %rdx, %rcx + btr $0x20, %rcx + addq %rdx, %r8 + movq %r8, (%rdi) + adcq %rax, %r9 + movq %r9, 0x8(%rdi) + adcq %rdx, %r10 + movq %r10, 0x10(%rdi) + adcq %rcx, %r11 + movq %r11, 0x18(%rdi) + movq $0xffffffffffffffff, %r8 + movq %r8, %r10 + subq (%rsp), %r8 + movabsq $0xffffffff00000000, %r9 + sbbq 0x8(%rsp), %r9 + sbbq 0x10(%rsp), %r10 + movabsq $0xfffffffeffffffff, %r11 + sbbq 0x18(%rsp), %r11 + movq %r11, %r12 + shldq $0x3, %r10, %r11 + shldq $0x3, %r9, %r10 + shldq $0x3, %r8, %r9 + shlq $0x3, %r8 + shrq $0x3d, %r12 + movq $0x3, %rdx + xorl %eax, %eax + mulxq 0x60(%rsp), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + mulxq 0x68(%rsp), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq 0x70(%rsp), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq 0x78(%rsp), %rax, %rdx + adcxq %rax, %r11 + adoxq %r12, %rdx + adcq $0x1, %rdx + movq %rdx, %rax + shlq $0x20, %rax + movq %rax, %rcx + subq %rdx, %rax + addq %rdx, %r8 + adcq %rax, %r9 + adcq $0x0, %r10 + adcq %rcx, %r11 + sbbq %rdx, %rdx + notq %rdx + movabsq $0xffffffff00000000, %rax + andq %rdx, %rax + movq %rdx, %rcx + btr $0x20, %rcx + addq %rdx, %r8 + movq %r8, 0x20(%rdi) + adcq %rax, %r9 + movq %r9, 0x28(%rdi) + adcq %rdx, %r10 + movq %r10, 0x30(%rdi) + adcq %rcx, %r11 + movq %r11, 0x38(%rdi) + addq $0xc0, 
%rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjscalarmul_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjscalarmul_alt.S new file mode 100644 index 00000000000..e946fbac25d --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjscalarmul_alt.S @@ -0,0 +1,4526 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery-Jacobian form scalar multiplication for GM/T 0003-2012 curve SM2 +// Input scalar[4], point[12]; output res[12] +// +// extern void sm2_montjscalarmul_alt +// (uint64_t res[static 12], +// uint64_t scalar[static 4], +// uint64_t point[static 12]); +// +// This function is a variant of its affine point version sm2_scalarmul. +// Here, input and output points are assumed to be in Jacobian form with +// their coordinates in the Montgomery domain. Thus, if priming indicates +// Montgomery form, x' = (2^256 * x) mod p_sm2 etc., each point argument +// is a triple (x',y',z') representing the affine point (x/z^2,y/z^3) when +// z' is nonzero or the point at infinity (group identity) if z' = 0. +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve SM2, returns a representation of n * P. If the result is the +// point at infinity (either because the input point was or because the +// scalar was a multiple of p_sm2) then the output is guaranteed to +// represent the point at infinity, i.e. to have its z coordinate zero. +// +// Standard x86-64 ABI: RDI = res, RSI = scalar, RDX = point +// Microsoft x64 ABI: RCX = res, RDX = scalar, R8 = point +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjscalarmul_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjscalarmul_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Intermediate variables on the stack. Uppercase syntactic variants +// make x86_att version simpler to generate. + +#define SCALARB (0*NUMSIZE) +#define scalarb (0*NUMSIZE)(%rsp) +#define ACC (1*NUMSIZE) +#define acc (1*NUMSIZE)(%rsp) +#define TABENT (4*NUMSIZE) +#define tabent (4*NUMSIZE)(%rsp) + +#define TAB (7*NUMSIZE) +#define tab (7*NUMSIZE)(%rsp) + +#define res (31*NUMSIZE)(%rsp) + +#define NSPACE (32*NUMSIZE) + +// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator, +// which doesn't accept repetitions, assembler macros etc. + +#define selectblock(I) \ + cmpq $I, %rdi ; \ + cmovzq TAB+96*(I-1)(%rsp), %rax ; \ + cmovzq TAB+96*(I-1)+8(%rsp), %rbx ; \ + cmovzq TAB+96*(I-1)+16(%rsp), %rcx ; \ + cmovzq TAB+96*(I-1)+24(%rsp), %rdx ; \ + cmovzq TAB+96*(I-1)+32(%rsp), %r8 ; \ + cmovzq TAB+96*(I-1)+40(%rsp), %r9 ; \ + cmovzq TAB+96*(I-1)+48(%rsp), %r10 ; \ + cmovzq TAB+96*(I-1)+56(%rsp), %r11 ; \ + cmovzq TAB+96*(I-1)+64(%rsp), %r12 ; \ + cmovzq TAB+96*(I-1)+72(%rsp), %r13 ; \ + cmovzq TAB+96*(I-1)+80(%rsp), %r14 ; \ + cmovzq TAB+96*(I-1)+88(%rsp), %r15 + +S2N_BN_SYMBOL(sm2_montjscalarmul_alt): + _CET_ENDBR + +// The Windows version literally calls the standard ABI version. +// This simplifies the proofs since subroutine offsets are fixed. 
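The header comment above already pins down the C-visible contract, so a caller only has to supply three little-endian limb arrays. The sketch below is a minimal, hypothetical usage example derived solely from the prototype and coordinate convention documented above (Jacobian triples (x',y',z') with coordinates in the Montgomery domain, z' = 0 encoding the point at infinity); the wrapper name and surrounding setup are illustrative assumptions, not part of the imported file.

  #include <stdint.h>

  /* Prototype exactly as documented in the header comment above. */
  extern void sm2_montjscalarmul_alt(uint64_t res[static 12],
                                     uint64_t scalar[static 4],
                                     uint64_t point[static 12]);

  /* Hypothetical wrapper: computes n * P for a point already converted to
     Montgomery-Jacobian form. The prototype takes non-const arrays, so the
     inputs are copied into scratch buffers first. */
  static void sm2_scalarmul_example(uint64_t out[static 12],
                                    const uint64_t n[static 4],
                                    const uint64_t p_jac[static 12]) {
    uint64_t scalar[4], point[12];
    for (int i = 0; i < 4; i++)  scalar[i] = n[i];
    for (int i = 0; i < 12; i++) point[i]  = p_jac[i];
    sm2_montjscalarmul_alt(out, scalar, point);
    /* out[8..11] all zero means the result is the point at infinity. */
  }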
+ +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + callq sm2_montjscalarmul_alt_standard + popq %rsi + popq %rdi + ret + +sm2_montjscalarmul_alt_standard: +#endif + +// Real start of the standard ABI code. + + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + + subq $NSPACE, %rsp + +// Preserve the "res" and "point" input arguments. We load and process the +// scalar immediately so we don't bother preserving that input argument. +// Also, "point" is only needed early on and so its register gets re-used. + + movq %rdx, %rbx + movq %rdi, res + +// Load the digits of group order n_sm2 = [%r15;%r14;%r13;%r12] + + movq $0x53bbf40939d54123, %r12 + movq $0x7203df6b21c6052b, %r13 + movq $0xffffffffffffffff, %r14 + movq $0xfffffffeffffffff, %r15 + +// First, reduce the input scalar mod n_sm2, i.e. conditionally subtract n_sm2 + + movq (%rsi), %r8 + subq %r12, %r8 + movq 8(%rsi), %r9 + sbbq %r13, %r9 + movq 16(%rsi), %r10 + sbbq %r14, %r10 + movq 24(%rsi), %r11 + sbbq %r15, %r11 + + cmovcq (%rsi), %r8 + cmovcq 8(%rsi), %r9 + cmovcq 16(%rsi), %r10 + cmovcq 24(%rsi), %r11 + +// Now if the top bit of the reduced scalar is set, negate it mod n_sm2, +// i.e. do n |-> n_sm2 - n. Remember the sign in %rbp so we can +// correspondingly negate the point below. + + subq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + sbbq %r11, %r15 + + movq %r11, %rbp + shrq $63, %rbp + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + +// In either case then add the recoding constant 0x08888...888 to allow +// signed digits. + + movq $0x8888888888888888, %rax + addq %rax, %r8 + adcq %rax, %r9 + adcq %rax, %r10 + adcq %rax, %r11 + btc $63, %r11 + + movq %r8, SCALARB(%rsp) + movq %r9, SCALARB+8(%rsp) + movq %r10, SCALARB+16(%rsp) + movq %r11, SCALARB+24(%rsp) + +// Set the tab[0] table entry to the input point = 1 * P, except +// that we negate it if the top bit of the scalar was set. This +// negation takes care over the y = 0 case to maintain all the +// coordinates < p_sm2 throughout, even though triples (x,y,z) +// with y = 0 can only represent a point on the curve when z = 0 +// and it represents the point at infinity regardless of x and y. 
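The recoding step above compresses a fair amount of reasoning: once the recoding constant has been added, every 4-bit window of the adjusted scalar, minus a bias of 8, is a signed digit in the range -8..7; the main loop further down recovers sign and magnitude with a subq/sbbq/xorq/subq sequence and uses the sign to drive the same careful y-negation described for tab[0] in the comment directly above. The C fragment below is only an illustrative model of that digit extraction, with hypothetical names (extract_digit, recoded); it mirrors the shift/mask/bias arithmetic of the assembly but is not part of the imported file.

  #include <stdint.h>

  /* Illustrative model: take the 4-bit window of the recoded scalar starting
     at bit position i (a multiple of 4), remove the bias of 8, and split the
     signed digit into a sign flag and an absolute value in 0..8. 'recoded'
     plays the role of the SCALARB limbs on the stack. */
  static void extract_digit(const uint64_t recoded[4], int i,
                            int *negate, uint64_t *abs_digit) {
    uint64_t window = (recoded[i >> 6] >> (i & 63)) & 15;  /* 4-bit window  */
    int64_t digit = (int64_t)window - 8;                   /* signed, -8..7 */
    *negate = digit < 0;              /* selects the conditional y negation */
    *abs_digit = (uint64_t)(digit < 0 ? -digit : digit);   /* table index 0..8 */
  }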
+ + movq (%rbx), %rax + movq %rax, TAB(%rsp) + movq 8(%rbx), %rax + movq %rax, TAB+8(%rsp) + movq 16(%rbx), %rax + movq %rax, TAB+16(%rsp) + movq 24(%rbx), %rax + movq %rax, TAB+24(%rsp) + + movq 32(%rbx), %r12 + movq %r12, %rax + movq 40(%rbx), %r13 + orq %r13, %rax + movq 48(%rbx), %r14 + movq %r14, %rcx + movq 56(%rbx), %r15 + orq %r15, %rcx + orq %rcx, %rax + cmovzq %rax, %rbp + + xorl %r11d, %r11d + movl $0x00000000ffffffff, %r9d + notq %r11 + movq %r11, %r8 + movq %r11, %r10 + xorq %r8, %r9 + btr $32, %r11 + + subq %r12, %r8 + sbbq %r13, %r9 + sbbq %r14, %r10 + sbbq %r15, %r11 + testq %rbp, %rbp + cmovzq %r12, %r8 + cmovzq %r13, %r9 + cmovzq %r14, %r10 + cmovzq %r15, %r11 + movq %r8, TAB+32(%rsp) + movq %r9, TAB+40(%rsp) + movq %r10, TAB+48(%rsp) + movq %r11, TAB+56(%rsp) + + movq 64(%rbx), %rax + movq %rax, TAB+64(%rsp) + movq 72(%rbx), %rax + movq %rax, TAB+72(%rsp) + movq 80(%rbx), %rax + movq %rax, TAB+80(%rsp) + movq 88(%rbx), %rax + movq %rax, TAB+88(%rsp) + +// Compute and record tab[1] = 2 * p, ..., tab[7] = 8 * P + + leaq TAB+96*1(%rsp), %rdi + leaq TAB(%rsp), %rsi + callq sm2_montjscalarmul_alt_sm2_montjdouble + + leaq TAB+96*2(%rsp), %rdi + leaq TAB+96*1(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq sm2_montjscalarmul_alt_sm2_montjadd + + leaq TAB+96*3(%rsp), %rdi + leaq TAB+96*1(%rsp), %rsi + callq sm2_montjscalarmul_alt_sm2_montjdouble + + leaq TAB+96*4(%rsp), %rdi + leaq TAB+96*3(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq sm2_montjscalarmul_alt_sm2_montjadd + + leaq TAB+96*5(%rsp), %rdi + leaq TAB+96*2(%rsp), %rsi + callq sm2_montjscalarmul_alt_sm2_montjdouble + + leaq TAB+96*6(%rsp), %rdi + leaq TAB+96*5(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq sm2_montjscalarmul_alt_sm2_montjadd + + leaq TAB+96*7(%rsp), %rdi + leaq TAB+96*3(%rsp), %rsi + callq sm2_montjscalarmul_alt_sm2_montjdouble + +// Set up accumulator as table entry for top 4 bits (constant-time indexing) + + movq SCALARB+24(%rsp), %rdi + shrq $60, %rdi + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d + + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + + movq %rax, ACC(%rsp) + movq %rbx, ACC+8(%rsp) + movq %rcx, ACC+16(%rsp) + movq %rdx, ACC+24(%rsp) + movq %r8, ACC+32(%rsp) + movq %r9, ACC+40(%rsp) + movq %r10, ACC+48(%rsp) + movq %r11, ACC+56(%rsp) + movq %r12, ACC+64(%rsp) + movq %r13, ACC+72(%rsp) + movq %r14, ACC+80(%rsp) + movq %r15, ACC+88(%rsp) + +// Main loop over size-4 bitfield + + movl $252, %ebp + +sm2_montjscalarmul_alt_mainloop: + subq $4, %rbp + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq sm2_montjscalarmul_alt_sm2_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq sm2_montjscalarmul_alt_sm2_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq sm2_montjscalarmul_alt_sm2_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq sm2_montjscalarmul_alt_sm2_montjdouble + + movq %rbp, %rax + shrq $6, %rax + movq (%rsp,%rax,8), %rdi + movq %rbp, %rcx + shrq %cl, %rdi + andq $15, %rdi + + subq $8, %rdi + sbbq %rsi, %rsi // %rsi = sign of digit (-1 = negative) + xorq %rsi, %rdi + subq %rsi, %rdi // %rdi = absolute value of digit + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, 
%r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d + + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + +// Store it to "tabent" with the y coordinate optionally negated +// Again, do it carefully to give coordinates < p_sm2 even in +// the degenerate case y = 0 (when z = 0 for points on the curve). + + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) + + movq %r12, TABENT+64(%rsp) + movq %r13, TABENT+72(%rsp) + movq %r14, TABENT+80(%rsp) + movq %r15, TABENT+88(%rsp) + + xorl %r15d, %r15d + movq %r8, %rax + movl $0x00000000ffffffff, %r13d + orq %r9, %rax + notq %r15 + movq %r10, %rcx + movq %r15, %r12 + orq %r11, %rcx + movq %r15, %r14 + xorq %r12, %r13 + btr $32, %r15 + orq %rcx, %rax + cmovzq %rax, %rsi + + subq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + sbbq %r11, %r15 + + testq %rsi, %rsi + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + + movq %r8, TABENT+32(%rsp) + movq %r9, TABENT+40(%rsp) + movq %r10, TABENT+48(%rsp) + movq %r11, TABENT+56(%rsp) + + leaq TABENT(%rsp), %rdx + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq sm2_montjscalarmul_alt_sm2_montjadd + + testq %rbp, %rbp + jne sm2_montjscalarmul_alt_mainloop + +// That's the end of the main loop, and we just need to copy the +// result in "acc" to the output. + + movq res, %rdi + movq ACC(%rsp), %rax + movq %rax, (%rdi) + movq ACC+8(%rsp), %rax + movq %rax, 8(%rdi) + movq ACC+16(%rsp), %rax + movq %rax, 16(%rdi) + movq ACC+24(%rsp), %rax + movq %rax, 24(%rdi) + + movq ACC+32(%rsp), %rax + movq %rax, 32(%rdi) + movq ACC+40(%rsp), %rax + movq %rax, 40(%rdi) + movq ACC+48(%rsp), %rax + movq %rax, 48(%rdi) + movq ACC+56(%rsp), %rax + movq %rax, 56(%rdi) + + movq ACC+64(%rsp), %rax + movq %rax, 64(%rdi) + movq ACC+72(%rsp), %rax + movq %rax, 72(%rdi) + movq ACC+80(%rsp), %rax + movq %rax, 80(%rdi) + movq ACC+88(%rsp), %rax + movq %rax, 88(%rdi) + +// Restore stack and registers and return + + addq $NSPACE, %rsp + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 + ret + +// Local copies of subroutines, complete clones at the moment + +sm2_montjscalarmul_alt_sm2_montjadd: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xe0, %rsp + movq %rdx, %rbp + movq 0x40(%rsi), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x48(%rsi), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x58(%rsi), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x50(%rsi), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x58(%rsi), %rbx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x50(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x58(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movq %r8, %rax + shlq 
$0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x40(%rbp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x48(%rbp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x58(%rbp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x50(%rbp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x40(%rbp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x48(%rbp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x58(%rbp), %rbx + movq 0x48(%rbp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x48(%rbp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x50(%rbp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x58(%rbp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx 
+ adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq 0x40(%rbp), %rax + mulq 0x20(%rsi) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq 0x40(%rbp), %rax + mulq 0x28(%rsi) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x48(%rbp), %rax + mulq 0x20(%rsi) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq 0x40(%rbp), %rax + mulq 0x30(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x48(%rbp), %rax + mulq 0x28(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x50(%rbp), %rax + mulq 0x20(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq 0x40(%rbp), %rax + mulq 0x38(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x48(%rbp), %rax + mulq 0x30(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x50(%rbp), %rax + mulq 0x28(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x58(%rbp), %rax + mulq 0x20(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x48(%rbp), %rax + mulq 0x38(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x50(%rbp), %rax + mulq 0x30(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x58(%rbp), %rax + mulq 0x28(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x50(%rbp), %rax + mulq 0x38(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x58(%rbp), %rax + mulq 0x30(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x58(%rbp), %rax + mulq 0x38(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0xc0(%rsp) + movq %r13, 0xc8(%rsp) + movq %r14, 0xd0(%rsp) + movq %r15, 0xd8(%rsp) + movq 0x40(%rsi), %rax + mulq 0x20(%rbp) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq 0x40(%rsi), %rax + mulq 0x28(%rbp) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x48(%rsi), %rax + mulq 0x20(%rbp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq 0x40(%rsi), %rax + mulq 0x30(%rbp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x48(%rsi), %rax 
+ mulq 0x28(%rbp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x50(%rsi), %rax + mulq 0x20(%rbp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq 0x40(%rsi), %rax + mulq 0x38(%rbp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x48(%rsi), %rax + mulq 0x30(%rbp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x50(%rsi), %rax + mulq 0x28(%rbp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x58(%rsi), %rax + mulq 0x20(%rbp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x48(%rsi), %rax + mulq 0x38(%rbp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x50(%rsi), %rax + mulq 0x30(%rbp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x58(%rsi), %rax + mulq 0x28(%rbp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x50(%rsi), %rax + mulq 0x38(%rbp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x58(%rsi), %rax + mulq 0x30(%rbp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x58(%rsi), %rax + mulq 0x38(%rbp) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq (%rsp), %rax + mulq 0x0(%rbp) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq (%rsp), %rax + mulq 0x8(%rbp) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x8(%rsp), %rax + mulq 0x0(%rbp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq (%rsp), %rax + mulq 0x10(%rbp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x8(%rsp), %rax + mulq 0x8(%rbp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x10(%rsp), %rax + mulq 0x0(%rbp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq (%rsp), %rax + mulq 0x18(%rbp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x8(%rsp), %rax + mulq 0x10(%rbp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x10(%rsp), %rax + mulq 0x8(%rbp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x18(%rsp), %rax + mulq 0x0(%rbp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x8(%rsp), %rax + mulq 0x18(%rbp) + addq %rax, %r12 + adcq 
%rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rsp), %rax + mulq 0x10(%rbp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x18(%rsp), %rax + mulq 0x8(%rbp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x10(%rsp), %rax + mulq 0x18(%rbp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rsp), %rax + mulq 0x10(%rbp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x18(%rsp), %rax + mulq 0x18(%rbp) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq 0xa0(%rsp), %rax + mulq (%rsi) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq 0xa0(%rsp), %rax + mulq 0x8(%rsi) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xa8(%rsp), %rax + mulq (%rsi) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq 0xa0(%rsp), %rax + mulq 0x10(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0xa8(%rsp), %rax + mulq 0x8(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0xb0(%rsp), %rax + mulq (%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq 0xa0(%rsp), %rax + mulq 0x18(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0xa8(%rsp), %rax + mulq 0x10(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0xb0(%rsp), %rax + mulq 0x8(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0xb8(%rsp), %rax + mulq (%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq 0x18(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq 0x10(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0xb8(%rsp), %rax + mulq 0x8(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0xb0(%rsp), %rax + mulq 0x18(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0xb8(%rsp), %rax + mulq 0x10(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0xb8(%rsp), %rax + mulq 0x18(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + 
subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq (%rsp), %rax + mulq 0x20(%rsp) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq (%rsp), %rax + mulq 0x28(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x8(%rsp), %rax + mulq 0x20(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq (%rsp), %rax + mulq 0x30(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x8(%rsp), %rax + mulq 0x28(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x10(%rsp), %rax + mulq 0x20(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq (%rsp), %rax + mulq 0x38(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x8(%rsp), %rax + mulq 0x30(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x10(%rsp), %rax + mulq 0x28(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x18(%rsp), %rax + mulq 0x20(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x8(%rsp), %rax + mulq 0x38(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rsp), %rax + mulq 0x30(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x18(%rsp), %rax + mulq 0x28(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x10(%rsp), %rax + mulq 0x38(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rsp), %rax + mulq 0x30(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x18(%rsp), %rax + mulq 0x38(%rsp) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq 
%rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq 0xa0(%rsp), %rax + mulq 0xc0(%rsp) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq 0xa0(%rsp), %rax + mulq 0xc8(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xa8(%rsp), %rax + mulq 0xc0(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq 0xa0(%rsp), %rax + mulq 0xd0(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0xa8(%rsp), %rax + mulq 0xc8(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0xb0(%rsp), %rax + mulq 0xc0(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq 0xa0(%rsp), %rax + mulq 0xd8(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0xa8(%rsp), %rax + mulq 0xd0(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0xb0(%rsp), %rax + mulq 0xc8(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0xb8(%rsp), %rax + mulq 0xc0(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq 0xd8(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq 0xd0(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0xb8(%rsp), %rax + mulq 0xc8(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0xb0(%rsp), %rax + mulq 0xd8(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0xb8(%rsp), %rax + mulq 0xd0(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0xb8(%rsp), %rax + mulq 0xd8(%rsp) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0xc0(%rsp) + movq %r13, 0xc8(%rsp) + movq %r14, 0xd0(%rsp) + movq %r15, 0xd8(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), 
%rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0xa0(%rsp) + adcq %r10, %rcx + movq %rcx, 0xa8(%rsp) + adcq %r11, %r8 + movq %r8, 0xb0(%rsp) + adcq %rdx, %r9 + movq %r9, 0xb8(%rsp) + movq 0x20(%rsp), %rax + subq 0xc0(%rsp), %rax + movq 0x28(%rsp), %rcx + sbbq 0xc8(%rsp), %rcx + movq 0x30(%rsp), %r8 + sbbq 0xd0(%rsp), %r8 + movq 0x38(%rsp), %r9 + sbbq 0xd8(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x20(%rsp) + adcq %r10, %rcx + movq %rcx, 0x28(%rsp) + adcq %r11, %r8 + movq %r8, 0x30(%rsp) + adcq %rdx, %r9 + movq %r9, 0x38(%rsp) + movq 0xa0(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0xa8(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0xb8(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0xb0(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0xa8(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0xb8(%rsp), %rbx + movq 0xa8(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0xa8(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0xb0(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0xb8(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x20(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x28(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x38(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 
0x30(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x38(%rsp), %rbx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x30(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x38(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x60(%rsp), %rax + mulq 0x80(%rsp) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq 0x60(%rsp), %rax + mulq 0x88(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x68(%rsp), %rax + mulq 0x80(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq 0x60(%rsp), %rax + mulq 0x90(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x68(%rsp), %rax + mulq 0x88(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x70(%rsp), %rax + mulq 0x80(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq 0x60(%rsp), %rax + mulq 0x98(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x68(%rsp), %rax + mulq 0x90(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x70(%rsp), %rax + mulq 0x88(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x78(%rsp), %rax + mulq 0x80(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x68(%rsp), %rax + mulq 0x98(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x70(%rsp), %rax + mulq 0x90(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x78(%rsp), %rax + mulq 0x88(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x70(%rsp), %rax + mulq 0x98(%rsp) 
+ addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x78(%rsp), %rax + mulq 0x90(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x78(%rsp), %rax + mulq 0x98(%rsp) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x60(%rsp), %rax + mulq 0x40(%rsp) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq 0x60(%rsp), %rax + mulq 0x48(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x68(%rsp), %rax + mulq 0x40(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq 0x60(%rsp), %rax + mulq 0x50(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x68(%rsp), %rax + mulq 0x48(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x70(%rsp), %rax + mulq 0x40(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq 0x60(%rsp), %rax + mulq 0x58(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x68(%rsp), %rax + mulq 0x50(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x70(%rsp), %rax + mulq 0x48(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x78(%rsp), %rax + mulq 0x40(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x68(%rsp), %rax + mulq 0x58(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x70(%rsp), %rax + mulq 0x50(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x78(%rsp), %rax + mulq 0x48(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x70(%rsp), %rax + mulq 0x58(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x78(%rsp), %rax + mulq 0x50(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x78(%rsp), %rax + mulq 0x58(%rsp) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, 
%r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq (%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq %r11, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq %r11, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + movq 0xa0(%rsp), %rax + mulq 0x40(%rsi) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq 0xa0(%rsp), %rax + mulq 0x48(%rsi) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xa8(%rsp), %rax + mulq 0x40(%rsi) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq 0xa0(%rsp), %rax + mulq 0x50(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0xa8(%rsp), %rax + mulq 0x48(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0xb0(%rsp), %rax + mulq 0x40(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq 0xa0(%rsp), %rax + mulq 0x58(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0xa8(%rsp), %rax + mulq 0x50(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0xb0(%rsp), %rax + mulq 0x48(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0xb8(%rsp), %rax + mulq 0x40(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq 0x58(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq 0x50(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0xb8(%rsp), %rax + mulq 0x48(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0xb0(%rsp), %rax + mulq 0x58(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0xb8(%rsp), %rax + mulq 0x50(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0xb8(%rsp), %rax + mulq 0x58(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + 
sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq (%rsp), %rax + subq 0x40(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x48(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x50(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x58(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq %r11, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x80(%rsp), %rax + subq (%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq %r11, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0x60(%rsp), %rax + mulq 0xc0(%rsp) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq 0x60(%rsp), %rax + mulq 0xc8(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x68(%rsp), %rax + mulq 0xc0(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq 0x60(%rsp), %rax + mulq 0xd0(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x68(%rsp), %rax + mulq 0xc8(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x70(%rsp), %rax + mulq 0xc0(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq 0x60(%rsp), %rax + mulq 0xd8(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x68(%rsp), %rax + mulq 0xd0(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x70(%rsp), %rax + mulq 0xc8(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x78(%rsp), %rax + mulq 0xc0(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x68(%rsp), %rax + mulq 0xd8(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x70(%rsp), %rax + mulq 0xd0(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x78(%rsp), %rax + mulq 0xc8(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x70(%rsp), %rax + mulq 0xd8(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x78(%rsp), %rax + 
mulq 0xd0(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x78(%rsp), %rax + mulq 0xd8(%rsp) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0xa0(%rsp), %rax + mulq 0x40(%rbp) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq 0xa0(%rsp), %rax + mulq 0x48(%rbp) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xa8(%rsp), %rax + mulq 0x40(%rbp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq 0xa0(%rsp), %rax + mulq 0x50(%rbp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0xa8(%rsp), %rax + mulq 0x48(%rbp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0xb0(%rsp), %rax + mulq 0x40(%rbp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq 0xa0(%rsp), %rax + mulq 0x58(%rbp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0xa8(%rsp), %rax + mulq 0x50(%rbp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0xb0(%rsp), %rax + mulq 0x48(%rbp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0xb8(%rsp), %rax + mulq 0x40(%rbp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq 0x58(%rbp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq 0x50(%rbp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0xb8(%rsp), %rax + mulq 0x48(%rbp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0xb0(%rsp), %rax + mulq 0x58(%rbp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0xb8(%rsp), %rax + mulq 0x50(%rbp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0xb8(%rsp), %rax + mulq 0x58(%rbp) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + 
shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq 0x20(%rsp), %rax + mulq 0x80(%rsp) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq 0x20(%rsp), %rax + mulq 0x88(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x28(%rsp), %rax + mulq 0x80(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq 0x20(%rsp), %rax + mulq 0x90(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x28(%rsp), %rax + mulq 0x88(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x30(%rsp), %rax + mulq 0x80(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq 0x20(%rsp), %rax + mulq 0x98(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x28(%rsp), %rax + mulq 0x90(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x30(%rsp), %rax + mulq 0x88(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x38(%rsp), %rax + mulq 0x80(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x28(%rsp), %rax + mulq 0x98(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x30(%rsp), %rax + mulq 0x90(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x38(%rsp), %rax + mulq 0x88(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x30(%rsp), %rax + mulq 0x98(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x38(%rsp), %rax + mulq 0x90(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x38(%rsp), %rax + mulq 0x98(%rsp) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq 
%r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x80(%rsp), %rax + subq 0x60(%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x68(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x70(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x78(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq %r11, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0x40(%rsi), %r8 + movq 0x48(%rsi), %r9 + movq 0x50(%rsi), %r10 + movq 0x58(%rsi), %r11 + movq %r8, %rax + movq %r9, %rdx + orq %r10, %rax + orq %r11, %rdx + orq %rdx, %rax + negq %rax + sbbq %rax, %rax + movq 0x40(%rbp), %r12 + movq 0x48(%rbp), %r13 + movq 0x50(%rbp), %r14 + movq 0x58(%rbp), %r15 + movq %r12, %rbx + movq %r13, %rdx + orq %r14, %rbx + orq %r15, %rdx + orq %rdx, %rbx + negq %rbx + sbbq %rbx, %rbx + cmpq %rax, %rbx + cmovbq %r8, %r12 + cmovbq %r9, %r13 + cmovbq %r10, %r14 + cmovbq %r11, %r15 + cmoveq 0xa0(%rsp), %r12 + cmoveq 0xa8(%rsp), %r13 + cmoveq 0xb0(%rsp), %r14 + cmoveq 0xb8(%rsp), %r15 + movq (%rsp), %rax + cmovbq (%rsi), %rax + cmova 0x0(%rbp), %rax + movq 0x8(%rsp), %rbx + cmovbq 0x8(%rsi), %rbx + cmova 0x8(%rbp), %rbx + movq 0x10(%rsp), %rcx + cmovbq 0x10(%rsi), %rcx + cmova 0x10(%rbp), %rcx + movq 0x18(%rsp), %rdx + cmovbq 0x18(%rsi), %rdx + cmova 0x18(%rbp), %rdx + movq 0x80(%rsp), %r8 + cmovbq 0x20(%rsi), %r8 + cmova 0x20(%rbp), %r8 + movq 0x88(%rsp), %r9 + cmovbq 0x28(%rsi), %r9 + cmova 0x28(%rbp), %r9 + movq 0x90(%rsp), %r10 + cmovbq 0x30(%rsi), %r10 + cmova 0x30(%rbp), %r10 + movq 0x98(%rsp), %r11 + cmovbq 0x38(%rsi), %r11 + cmova 0x38(%rbp), %r11 + movq %rax, (%rdi) + movq %rbx, 0x8(%rdi) + movq %rcx, 0x10(%rdi) + movq %rdx, 0x18(%rdi) + movq %r8, 0x20(%rdi) + movq %r9, 0x28(%rdi) + movq %r10, 0x30(%rdi) + movq %r11, 0x38(%rdi) + movq %r12, 0x40(%rdi) + movq %r13, 0x48(%rdi) + movq %r14, 0x50(%rdi) + movq %r15, 0x58(%rdi) + addq $0xe0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +sm2_montjscalarmul_alt_sm2_montjdouble: + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xc0, %rsp + movq 0x40(%rsi), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x48(%rsi), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x58(%rsi), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x50(%rsi), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x58(%rsi), %rbx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x50(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x58(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + 
movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x20(%rsi), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x28(%rsi), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x38(%rsi), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x30(%rsi), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x20(%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x28(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x38(%rsi), %rbx + movq 0x28(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x28(%rsi), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x30(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x38(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, 
%r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq (%rsi), %rax + subq (%rsp), %rax + movq 0x8(%rsi), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x10(%rsi), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x18(%rsi), %r9 + sbbq 0x18(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq %r11, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + movq (%rsi), %rax + addq (%rsp), %rax + movq 0x8(%rsi), %rcx + adcq 0x8(%rsp), %rcx + movq 0x10(%rsi), %r8 + adcq 0x10(%rsp), %r8 + movq 0x18(%rsi), %r9 + adcq 0x18(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + subq %r11, %rax + movq %rax, 0x40(%rsp) + sbbq %r10, %rcx + movq %rcx, 0x48(%rsp) + sbbq %r11, %r8 + movq %r8, 0x50(%rsp) + sbbq %rdx, %r9 + movq %r9, 0x58(%rsp) + movq 0x40(%rsp), %rax + mulq 0x60(%rsp) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq 0x40(%rsp), %rax + mulq 0x68(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x48(%rsp), %rax + mulq 0x60(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq 0x40(%rsp), %rax + mulq 0x70(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x48(%rsp), %rax + mulq 0x68(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x50(%rsp), %rax + mulq 0x60(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq 0x40(%rsp), %rax + mulq 0x78(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x48(%rsp), %rax + mulq 0x70(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x50(%rsp), %rax + mulq 0x68(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x58(%rsp), %rax + mulq 0x60(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x48(%rsp), %rax + mulq 0x78(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x50(%rsp), %rax + mulq 0x70(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x58(%rsp), %rax + mulq 0x68(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x50(%rsp), %rax + mulq 0x78(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x58(%rsp), %rax + mulq 0x70(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x58(%rsp), %rax + mulq 0x78(%rsp) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, 
%eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + xorq %r11, %r11 + movq 0x20(%rsi), %rax + addq 0x40(%rsi), %rax + movq 0x28(%rsi), %rcx + adcq 0x48(%rsi), %rcx + movq 0x30(%rsi), %r8 + adcq 0x50(%rsi), %r8 + movq 0x38(%rsi), %r9 + adcq 0x58(%rsi), %r9 + adcq %r11, %r11 + subq $0xffffffffffffffff, %rax + movabsq $0xffffffff00000000, %r10 + sbbq %r10, %rcx + sbbq $0xffffffffffffffff, %r8 + movabsq $0xfffffffeffffffff, %rdx + sbbq %rdx, %r9 + sbbq $0x0, %r11 + andq %r11, %r10 + andq %r11, %rdx + addq %r11, %rax + movq %rax, 0x40(%rsp) + adcq %r10, %rcx + movq %rcx, 0x48(%rsp) + adcq %r11, %r8 + movq %r8, 0x50(%rsp) + adcq %rdx, %r9 + movq %r9, 0x58(%rsp) + movq (%rsi), %rax + mulq 0x20(%rsp) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq (%rsi), %rax + mulq 0x28(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x8(%rsi), %rax + mulq 0x20(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq (%rsi), %rax + mulq 0x30(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x8(%rsi), %rax + mulq 0x28(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x10(%rsi), %rax + mulq 0x20(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq (%rsi), %rax + mulq 0x38(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x8(%rsi), %rax + mulq 0x30(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x10(%rsi), %rax + mulq 0x28(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x18(%rsi), %rax + mulq 0x20(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x8(%rsi), %rax + mulq 0x38(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rsi), %rax + mulq 0x30(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x18(%rsi), %rax + mulq 0x28(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x10(%rsi), %rax + mulq 0x38(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rsi), %rax + mulq 0x30(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x18(%rsi), %rax + mulq 0x38(%rsp) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 
+ adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x60(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x68(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x78(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x70(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x68(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x78(%rsp), %rbx + movq 0x68(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x68(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x70(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x78(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq 0x40(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x48(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x58(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x50(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x40(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x48(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x58(%rsp), %rbx + movq 0x48(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 
+ xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x48(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x50(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x58(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq $0xffffffffffffffff, %r9 + movq %r9, %r11 + subq 0xa0(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq 0xa8(%rsp), %r10 + sbbq 0xb0(%rsp), %r11 + movabsq $0xfffffffeffffffff, %r12 + sbbq 0xb8(%rsp), %r12 + movq $0x9, %rcx + movq %r9, %rax + mulq %rcx + movq %rax, %r8 + movq %rdx, %r9 + movq %r10, %rax + xorl %r10d, %r10d + mulq %rcx + addq %rax, %r9 + adcq %rdx, %r10 + movq %r11, %rax + xorl %r11d, %r11d + mulq %rcx + addq %rax, %r10 + adcq %rdx, %r11 + movq %r12, %rax + xorl %r12d, %r12d + mulq %rcx + addq %rax, %r11 + adcq %rdx, %r12 + movl $0xc, %ecx + movq 0x80(%rsp), %rax + mulq %rcx + addq %rax, %r8 + adcq %rdx, %r9 + sbbq %rbx, %rbx + movq 0x88(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rbx, %rbx + movq 0x90(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rbx, %rbx + movq 0x98(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + leaq 0x1(%r12), %rdx + movq %rdx, %rax + shlq $0x20, %rax + movq %rax, %rcx + subq %rdx, %rax + addq %rdx, %r8 + adcq %rax, %r9 + adcq $0x0, %r10 + adcq %rcx, %r11 + sbbq %rdx, %rdx + notq %rdx + movabsq $0xffffffff00000000, %rax + andq %rdx, %rax + movq %rdx, %rcx + btr $0x20, %rcx + addq %rdx, %r8 + movq %r8, 0xa0(%rsp) + adcq %rax, %r9 + movq %r9, 0xa8(%rsp) + adcq %rdx, %r10 + movq %r10, 0xb0(%rsp) + adcq %rcx, %r11 + movq %r11, 0xb8(%rsp) + movq 0x40(%rsp), %rax + subq (%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x40(%rsp) + adcq %r10, %rcx 
+ movq %rcx, 0x48(%rsp) + adcq %r11, %r8 + movq %r8, 0x50(%rsp) + adcq %rdx, %r9 + movq %r9, 0x58(%rsp) + movq 0x20(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x28(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x38(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x30(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x38(%rsp), %rbx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x30(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x38(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0xa0(%rsp), %rax + mulq 0x60(%rsp) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq 0xa0(%rsp), %rax + mulq 0x68(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xa8(%rsp), %rax + mulq 0x60(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq 0xa0(%rsp), %rax + mulq 0x70(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0xa8(%rsp), %rax + mulq 0x68(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0xb0(%rsp), %rax + mulq 0x60(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq 0xa0(%rsp), %rax + mulq 0x78(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0xa8(%rsp), %rax + mulq 0x70(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0xb0(%rsp), %rax + mulq 0x68(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0xb8(%rsp), %rax + mulq 0x60(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + 
xorq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq 0x78(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq 0x70(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0xb8(%rsp), %rax + mulq 0x68(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0xb0(%rsp), %rax + mulq 0x78(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0xb8(%rsp), %rax + mulq 0x70(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0xb8(%rsp), %rax + mulq 0x78(%rsp) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x40(%rsp), %rax + subq 0x20(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x28(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x30(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x38(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x40(%rdi) + adcq %r10, %rcx + movq %rcx, 0x48(%rdi) + adcq %r11, %r8 + movq %r8, 0x50(%rdi) + adcq %rdx, %r9 + movq %r9, 0x58(%rdi) + movq 0x98(%rsp), %r11 + movq %r11, %rdx + movq 0x90(%rsp), %r10 + shldq $0x2, %r10, %r11 + movq 0x88(%rsp), %r9 + shldq $0x2, %r9, %r10 + movq 0x80(%rsp), %r8 + shldq $0x2, %r8, %r9 + shlq $0x2, %r8 + shrq $0x3e, %rdx + addq $0x1, %rdx + subq 0xa0(%rsp), %r8 + sbbq 0xa8(%rsp), %r9 + sbbq 0xb0(%rsp), %r10 + sbbq 0xb8(%rsp), %r11 + sbbq $0x0, %rdx + movq %rdx, %rax + shlq $0x20, %rax + movq %rax, %rcx + subq %rdx, %rax + addq %rdx, %r8 + adcq %rax, %r9 + adcq $0x0, %r10 + adcq %rcx, %r11 + sbbq %rdx, %rdx + notq %rdx + movabsq $0xffffffff00000000, %rax + andq %rdx, %rax + movq %rdx, %rcx + btr $0x20, %rcx + addq %rdx, %r8 + movq %r8, (%rdi) + adcq %rax, %r9 + movq %r9, 0x8(%rdi) + adcq %rdx, %r10 + movq %r10, 0x10(%rdi) + adcq %rcx, %r11 + movq %r11, 0x18(%rdi) + movq $0xffffffffffffffff, %r8 + movq %r8, %r10 + subq (%rsp), %r8 + movabsq $0xffffffff00000000, %r9 + sbbq 0x8(%rsp), %r9 + sbbq 0x10(%rsp), %r10 + movabsq $0xfffffffeffffffff, %r11 + sbbq 0x18(%rsp), %r11 + movq %r11, %r12 + shldq $0x3, %r10, %r11 + shldq $0x3, %r9, %r10 + shldq $0x3, %r8, %r9 + shlq $0x3, %r8 + shrq $0x3d, %r12 + movl $0x3, %ecx + movq 0x60(%rsp), %rax + mulq %rcx + 
addq %rax, %r8 + adcq %rdx, %r9 + sbbq %rbx, %rbx + movq 0x68(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rbx, %rbx + movq 0x70(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rbx, %rbx + movq 0x78(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + leaq 0x1(%r12), %rdx + movq %rdx, %rax + shlq $0x20, %rax + movq %rax, %rcx + subq %rdx, %rax + addq %rdx, %r8 + adcq %rax, %r9 + adcq $0x0, %r10 + adcq %rcx, %r11 + sbbq %rdx, %rdx + notq %rdx + movabsq $0xffffffff00000000, %rax + andq %rdx, %rax + movq %rdx, %rcx + btr $0x20, %rcx + addq %rdx, %r8 + movq %r8, 0x20(%rdi) + adcq %rax, %r9 + movq %r9, 0x28(%rdi) + adcq %rdx, %r10 + movq %r10, 0x30(%rdi) + adcq %rcx, %r11 + movq %r11, 0x38(%rdi) + addq $0xc0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif
diff --git a/third_party/s2n-bignum/s2n-bignum_aws-lc.h b/third_party/s2n-bignum/s2n-bignum_aws-lc.h
new file mode 100644
index 00000000000..a08696d4ace
--- /dev/null
+++ b/third_party/s2n-bignum/s2n-bignum_aws-lc.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License").
+ * You may not use this file except in compliance with the License.
+ * A copy of the License is located at
+ *
+ *  http://aws.amazon.com/apache2.0
+ *
+ * or in the "LICENSE" file accompanying this file. This file is distributed
+ * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+#ifndef S2N_BIGNUM_AWS_LC_H
+#define S2N_BIGNUM_AWS_LC_H
+
+#include "s2n-bignum-imported/include/s2n-bignum.h"
+
+// ----------------------------------------------------------------------------
+// C prototypes for s2n-bignum functions used in AWS-LC
+// ----------------------------------------------------------------------------
+
+// For some functions there are additional variants with names ending in
+// "_alt". These have the same core mathematical functionality as their
+// non-"alt" versions, but can be better suited to some microarchitectures:
+//
+//   - On x86, the "_alt" forms avoid BMI and ADX instruction set
+//     extensions, so will run on any x86_64 machine, even older ones
+//
+//   - On ARM, the "_alt" forms target machines with higher multiplier
+//     throughput, generally offering higher performance there.
+// For each of those, we define a _selector function that selects, at runtime,
+// the _alt or non-_alt version to run.
+
+#if defined(OPENSSL_X86_64)
+// On x86_64 platforms s2n-bignum uses the bmi2 and adx instruction sets
+// for some of the functions. These instructions are not supported by
+// every x86 CPU, so we have to check whether they are available and, if
+// they are not, fall back to a slightly slower but generic implementation.
+static inline uint8_t use_s2n_bignum_alt(void) {
+  return (!CRYPTO_is_BMI2_capable() || !CRYPTO_is_ADX_capable());
+}
+#else
+// On aarch64 platforms s2n-bignum has two implementations of certain
+// functions -- the default one and the alternative (suffixed _alt).
+// Depending on the architecture, one version is faster than the other.
+// Generally, the "_alt" functions are faster on architectures with higher
+// multiplier throughput, for example, Graviton 3, Apple's M1 and iPhone chips.
+static inline uint8_t use_s2n_bignum_alt(void) {
+  return CRYPTO_is_ARMv8_wide_multiplier_capable();
+}
+#endif
+
+#define S2NBIGNUM_KSQR_16_32_TEMP_NWORDS 24
+#define S2NBIGNUM_KMUL_16_32_TEMP_NWORDS 32
+#define S2NBIGNUM_KSQR_32_64_TEMP_NWORDS 72
+#define S2NBIGNUM_KMUL_32_64_TEMP_NWORDS 96
+
+static inline void p256_montjscalarmul_selector(uint64_t res[S2N_BIGNUM_STATIC 12], const uint64_t scalar[S2N_BIGNUM_STATIC 4], uint64_t point[S2N_BIGNUM_STATIC 12]) {
+  if (use_s2n_bignum_alt()) { p256_montjscalarmul_alt(res, scalar, point); }
+  else { p256_montjscalarmul(res, scalar, point); }
+}
+
+static inline void bignum_deamont_p384_selector(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]) {
+  if (use_s2n_bignum_alt()) { bignum_deamont_p384_alt(z, x); }
+  else { bignum_deamont_p384(z, x); }
+}
+
+static inline void bignum_montmul_p384_selector(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]) {
+  if (use_s2n_bignum_alt()) { bignum_montmul_p384_alt(z, x, y); }
+  else { bignum_montmul_p384(z, x, y); }
+}
+
+static inline void bignum_montsqr_p384_selector(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]) {
+  if (use_s2n_bignum_alt()) { bignum_montsqr_p384_alt(z, x); }
+  else { bignum_montsqr_p384(z, x); }
+}
+
+static inline void bignum_tomont_p384_selector(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]) {
+  if (use_s2n_bignum_alt()) { bignum_tomont_p384_alt(z, x); }
+  else { bignum_tomont_p384(z, x); }
+}
+
+static inline void p384_montjdouble_selector(uint64_t p3[S2N_BIGNUM_STATIC 18],uint64_t p1[S2N_BIGNUM_STATIC 18]) {
+  if (use_s2n_bignum_alt()) { p384_montjdouble_alt(p3, p1); }
+  else { p384_montjdouble(p3, p1); }
+}
+
+static inline void p384_montjscalarmul_selector(uint64_t res[S2N_BIGNUM_STATIC 18], const uint64_t scalar[S2N_BIGNUM_STATIC 6], uint64_t point[S2N_BIGNUM_STATIC 18]) {
+  if (use_s2n_bignum_alt()) { p384_montjscalarmul_alt(res, scalar, point); }
+  else { p384_montjscalarmul(res, scalar, point); }
+}
+
+static inline void bignum_mul_p521_selector(uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]) {
+  if (use_s2n_bignum_alt()) { bignum_mul_p521_alt(z, x, y); }
+  else { bignum_mul_p521(z, x, y); }
+}
+
+static inline void bignum_sqr_p521_selector(uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]) {
+  if (use_s2n_bignum_alt()) { bignum_sqr_p521_alt(z, x); }
+  else { bignum_sqr_p521(z, x); }
+}
+
+static inline void p521_jdouble_selector(uint64_t p3[S2N_BIGNUM_STATIC 27],uint64_t p1[S2N_BIGNUM_STATIC 27]) {
+  if (use_s2n_bignum_alt()) { p521_jdouble_alt(p3, p1); }
+  else { p521_jdouble(p3, p1); }
+}
+
+static inline void p521_jscalarmul_selector(uint64_t res[S2N_BIGNUM_STATIC 27], const uint64_t scalar[S2N_BIGNUM_STATIC 9], const uint64_t point[S2N_BIGNUM_STATIC 27]) {
+  if (use_s2n_bignum_alt()) { p521_jscalarmul_alt(res, scalar, point); }
+  else { p521_jscalarmul(res, scalar, point); }
+}
+
+static inline void curve25519_x25519_byte_selector(uint8_t res[S2N_BIGNUM_STATIC 32], const uint8_t scalar[S2N_BIGNUM_STATIC 32], const uint8_t point[S2N_BIGNUM_STATIC 32]) {
+  if (use_s2n_bignum_alt()) { curve25519_x25519_byte_alt(res, scalar, point); }
+  else { curve25519_x25519_byte(res, scalar, point); }
+}
+
+static inline void curve25519_x25519base_byte_selector(uint8_t res[S2N_BIGNUM_STATIC 32], const uint8_t scalar[S2N_BIGNUM_STATIC 32]) {
+  if (use_s2n_bignum_alt()) { curve25519_x25519base_byte_alt(res, scalar); }
+  else { curve25519_x25519base_byte(res, scalar); }
+}
+
+static inline void bignum_madd_n25519_selector(uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t x[S2N_BIGNUM_STATIC 4], uint64_t y[S2N_BIGNUM_STATIC 4], uint64_t c[S2N_BIGNUM_STATIC 4]) {
+  if (use_s2n_bignum_alt()) { bignum_madd_n25519_alt(z, x, y, c); }
+  else { bignum_madd_n25519(z, x, y, c); }
+}
+
+static inline uint64_t edwards25519_decode_selector(uint64_t z[S2N_BIGNUM_STATIC 8], const uint8_t c[S2N_BIGNUM_STATIC 32]) {
+  if (use_s2n_bignum_alt()) { return edwards25519_decode_alt(z, c); }
+  else { return edwards25519_decode(z, c); }
+}
+
+static inline void edwards25519_scalarmulbase_selector(uint64_t res[S2N_BIGNUM_STATIC 8], uint64_t scalar[S2N_BIGNUM_STATIC 4]) {
+  if (use_s2n_bignum_alt()) { edwards25519_scalarmulbase_alt(res, scalar); }
+  else { edwards25519_scalarmulbase(res, scalar); }
+}
+
+static inline void edwards25519_scalarmuldouble_selector(uint64_t res[S2N_BIGNUM_STATIC 8], uint64_t scalar[S2N_BIGNUM_STATIC 4], uint64_t point[S2N_BIGNUM_STATIC 8], uint64_t bscalar[S2N_BIGNUM_STATIC 4]) {
+  if (use_s2n_bignum_alt()) { edwards25519_scalarmuldouble_alt(res, scalar, point, bscalar); }
+  else { edwards25519_scalarmuldouble(res, scalar, point, bscalar); }
+}
+
+#endif // S2N_BIGNUM_AWS_LC_H
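
For readers unfamiliar with the _selector pattern introduced by this header, the following is a minimal, self-contained C sketch of the same idea. It is not part of the diff: the widget_* functions and the stubbed use_widget_alt() probe are hypothetical stand-ins for the real pairs (e.g. bignum_montmul_p384 / bignum_montmul_p384_alt) and for the real capability checks (CRYPTO_is_BMI2_capable(), CRYPTO_is_ADX_capable(), CRYPTO_is_ARMv8_wide_multiplier_capable()).

/* Sketch of the _selector dispatch pattern; all names here are made up. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for a runtime CPU-feature probe such as CRYPTO_is_ADX_capable(). */
static inline uint8_t use_widget_alt(void) {
  return 0; /* pretend the non-_alt code path is preferable on this CPU */
}

/* Two variants that compute identical results, like foo() and foo_alt(). */
static inline void widget_mul(uint64_t z[4], const uint64_t x[4], uint64_t k) {
  for (int i = 0; i < 4; i++) z[i] = x[i] * k;
}
static inline void widget_mul_alt(uint64_t z[4], const uint64_t x[4], uint64_t k) {
  for (int i = 0; i < 4; i++) z[i] = x[i] * k; /* same math, different tuning */
}

/* The selector branches on the probe so callers never pick a variant. */
static inline void widget_mul_selector(uint64_t z[4], const uint64_t x[4], uint64_t k) {
  if (use_widget_alt()) { widget_mul_alt(z, x, k); }
  else { widget_mul(z, x, k); }
}

int main(void) {
  const uint64_t x[4] = {1, 2, 3, 4};
  uint64_t z[4];
  widget_mul_selector(z, x, 10); /* callers see one entry point */
  printf("%" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
         z[0], z[1], z[2], z[3]);
  return 0;
}

The selectors in s2n-bignum_aws-lc.h follow this same shape: a single capability test inside a static inline wrapper, so the per-call dispatch cost is one predictable branch and calling code never needs to know which variant exists.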